4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
25 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2015, Joyent, Inc. All rights reserved.
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/strsun.h>
48 #include <sys/sunddi.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vmsystm.h>
54 #include <sys/policy.h>
55 #include <sys/limits.h>
57 #include <sys/socket.h>
58 #include <sys/socketvar.h>
60 #include <sys/isa_defs.h>
61 #include <sys/inttypes.h>
62 #include <sys/systm.h>
63 #include <sys/cpuvar.h>
64 #include <sys/filio.h>
65 #include <sys/sendfile.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_kpm.h>
71 #include "sockcommon.h"
72 #include "sockfilter_impl.h"
76 int do_useracc
= 1; /* Controlled by setting SO_DEBUG to 4 */
79 #endif /* SOCK_TEST */
81 extern int xnet_truncate_print
;
84 * Kernel component of socket creation.
86 * First the library calls this with a NULL devpath. If this fails
87 * to find a transport (using solookup) the library will look in /etc/netconfig
88 * for the appropriate transport. If one is found it will pass in the
89 * devpath for the kernel to use.
92 so_socket(int family
, int type_w_flags
, int protocol
, char *devpath
)
101 type
= type_w_flags
& SOCK_TYPE_MASK
;
102 type_w_flags
&= ~SOCK_TYPE_MASK
;
103 if (type_w_flags
& ~(SOCK_CLOEXEC
|SOCK_NDELAY
|SOCK_NONBLOCK
))
104 return (set_errno(EINVAL
));
106 if (devpath
!= NULL
) {
108 size_t kdevpathlen
= 0;
110 buf
= kmem_alloc(MAXPATHLEN
, KM_SLEEP
);
111 if ((error
= copyinstr(devpath
, buf
,
112 MAXPATHLEN
, &kdevpathlen
)) != 0) {
113 kmem_free(buf
, MAXPATHLEN
);
114 return (set_errno(error
));
116 so
= socket_create(family
, type
, protocol
, buf
, NULL
,
117 SOCKET_SLEEP
, CRED(), &error
);
118 kmem_free(buf
, MAXPATHLEN
);
120 so
= socket_create(family
, type
, protocol
, NULL
, NULL
,
121 SOCKET_SLEEP
, CRED(), &error
);
124 return (set_errno(error
));
126 /* Allocate a file descriptor for the socket */
128 if (error
= falloc(vp
, FWRITE
|FREAD
, &fp
, &fd
)) {
129 (void) socket_close(so
, 0, CRED());
131 return (set_errno(error
));
135 * Now fill in the entries that falloc reserved
137 if (type_w_flags
& SOCK_NDELAY
) {
138 so
->so_state
|= SS_NDELAY
;
139 fp
->f_flag
|= FNDELAY
;
141 if (type_w_flags
& SOCK_NONBLOCK
) {
142 so
->so_state
|= SS_NONBLOCK
;
143 fp
->f_flag
|= FNONBLOCK
;
145 mutex_exit(&fp
->f_tlock
);
147 if ((type_w_flags
& SOCK_CLOEXEC
) != 0) {
148 f_setfd(fd
, FD_CLOEXEC
);
155 * Map from a file descriptor to a socket node.
156 * Returns with the file descriptor held i.e. the caller has to
157 * use releasef when done with the file descriptor.
160 getsonode(int sock
, int *errorp
, file_t
**fpp
)
166 if ((fp
= getf(sock
)) == NULL
) {
172 /* Check if it is a socket */
173 if (vp
->v_type
!= VSOCK
) {
180 * Use the stream head to find the real socket vnode.
181 * This is needed when namefs sits above sockfs.
184 ASSERT(vp
->v_stream
->sd_vnode
);
185 vp
= vp
->v_stream
->sd_vnode
;
188 if (so
->so_is_stream
) {
191 eprintsoline(so
, *errorp
);
203 * Allocate and copyin a sockaddr.
204 * Ensures NULL termination for AF_UNIX addresses by extending them
205 * with one NULL byte if need be. Verifies that the length is not
206 * excessive to prevent an application from consuming all of kernel
207 * memory. Returns NULL when an error occurred.
209 static struct sockaddr
*
210 copyin_name(struct sonode
*so
, struct sockaddr
*name
, socklen_t
*namelenp
,
214 size_t namelen
= (size_t)*namelenp
;
216 ASSERT(namelen
!= 0);
217 if (namelen
> SO_MAXARGSIZE
) {
219 eprintsoline(so
, *errorp
);
223 faddr
= kmem_alloc(namelen
, KM_SLEEP
);
224 if (copyin(name
, faddr
, namelen
)) {
225 kmem_free(faddr
, namelen
);
227 eprintsoline(so
, *errorp
);
232 * Add space for NULL termination if needed.
233 * Do a quick check if the last byte is NUL.
235 if (so
->so_family
== AF_UNIX
&& faddr
[namelen
- 1] != '\0') {
236 /* Check if there is any NULL termination */
240 for (i
= sizeof (name
->sa_family
); i
< namelen
; i
++) {
241 if (faddr
[i
] == '\0') {
247 /* Add extra byte for NUL padding */
250 nfaddr
= kmem_alloc(namelen
+ 1, KM_SLEEP
);
251 bcopy(faddr
, nfaddr
, namelen
);
252 kmem_free(faddr
, namelen
);
255 nfaddr
[namelen
] = '\0';
257 ASSERT((socklen_t
)namelen
== namelen
);
258 *namelenp
= (socklen_t
)namelen
;
262 return ((struct sockaddr
*)faddr
);
266 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
269 copyout_arg(void *uaddr
, socklen_t ulen
, void *ulenp
, void *kaddr
,
277 if (copyout(kaddr
, uaddr
, ulen
))
284 if (copyout(&ulen
, ulenp
, sizeof (ulen
)))
291 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
292 * If klen is greater than ulen it still uses the non-truncated
293 * klen to update ulenp.
296 copyout_name(void *uaddr
, socklen_t ulen
, void *ulenp
, void *kaddr
,
302 else if (ulen
!= 0 && xnet_truncate_print
) {
303 printf("sockfs: truncating copyout of address using "
304 "XNET semantics for pid = %d. Lengths %d, %d\n",
305 curproc
->p_pid
, klen
, ulen
);
309 if (copyout(kaddr
, uaddr
, ulen
))
317 if (copyout(&klen
, ulenp
, sizeof (klen
)))
324 * The socketpair() code in libsocket creates two sockets (using
325 * the /etc/netconfig fallback if needed) before calling this routine
326 * to connect the two sockets together.
328 * For a SOCK_STREAM socketpair a listener is needed - in that case this
329 * routine will create a new file descriptor as part of accepting the
330 * connection. The library socketpair() will check if svs[2] has changed
331 * in which case it will close the changed fd.
333 * Note that this code could use the TPI feature of accepting the connection
334 * on the listening endpoint. However, that would require significant changes
338 so_socketpair(int sv
[2])
341 struct sonode
*so1
, *so2
;
344 struct sockaddr_ux
*name
;
349 dprint(1, ("so_socketpair(%p)\n", (void *)sv
));
351 error
= useracc(sv
, sizeof (svs
), B_WRITE
);
352 if (error
&& do_useracc
)
353 return (set_errno(EFAULT
));
355 if (copyin(sv
, svs
, sizeof (svs
)))
356 return (set_errno(EFAULT
));
358 if ((so1
= getsonode(svs
[0], &error
, NULL
)) == NULL
)
359 return (set_errno(error
));
361 if ((so2
= getsonode(svs
[1], &error
, NULL
)) == NULL
) {
363 return (set_errno(error
));
366 if (so1
->so_family
!= AF_UNIX
|| so2
->so_family
!= AF_UNIX
) {
375 * The code below makes assumptions about the "sockfs" implementation.
376 * So make sure that the correct implementation is really used.
378 ASSERT(so1
->so_ops
== &sotpi_sonodeops
);
379 ASSERT(so2
->so_ops
== &sotpi_sonodeops
);
381 if (so1
->so_type
== SOCK_DGRAM
) {
383 * Bind both sockets and connect them with each other.
384 * Need to allocate name/namelen for soconnect.
386 error
= socket_bind(so1
, NULL
, 0, _SOBIND_UNSPEC
, CRED());
388 eprintsoline(so1
, error
);
391 error
= socket_bind(so2
, NULL
, 0, _SOBIND_UNSPEC
, CRED());
393 eprintsoline(so2
, error
);
396 namelen
= sizeof (struct sockaddr_ux
);
397 name
= kmem_alloc(namelen
, KM_SLEEP
);
398 name
->sou_family
= AF_UNIX
;
399 name
->sou_addr
= sti2
->sti_ux_laddr
;
400 error
= socket_connect(so1
,
401 (struct sockaddr
*)name
,
403 0, _SOCONNECT_NOXLATE
, CRED());
405 kmem_free(name
, namelen
);
406 eprintsoline(so1
, error
);
409 name
->sou_addr
= sti1
->sti_ux_laddr
;
410 error
= socket_connect(so2
,
411 (struct sockaddr
*)name
,
413 0, _SOCONNECT_NOXLATE
, CRED());
414 kmem_free(name
, namelen
);
416 eprintsoline(so2
, error
);
423 * Bind both sockets, with so1 being a listener.
424 * Connect so2 to so1 - nonblocking to avoid waiting for
425 * soaccept to complete.
426 * Accept a connection on so1. Pass out the new fd as sv[0].
427 * The library will detect the changed fd and close
436 * We could simply call socket_listen() here (which would do the
437 * binding automatically) if the code didn't rely on passing
438 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
440 error
= socket_bind(so1
, NULL
, 0, _SOBIND_UNSPEC
|
441 _SOBIND_NOXLATE
|_SOBIND_LISTEN
|_SOBIND_SOCKETPAIR
,
444 eprintsoline(so1
, error
);
447 error
= socket_bind(so2
, NULL
, 0, _SOBIND_UNSPEC
, CRED());
449 eprintsoline(so2
, error
);
453 namelen
= sizeof (struct sockaddr_ux
);
454 name
= kmem_alloc(namelen
, KM_SLEEP
);
455 name
->sou_family
= AF_UNIX
;
456 name
->sou_addr
= sti1
->sti_ux_laddr
;
457 error
= socket_connect(so2
,
458 (struct sockaddr
*)name
,
460 FNONBLOCK
, _SOCONNECT_NOXLATE
, CRED());
461 kmem_free(name
, namelen
);
463 if (error
!= EINPROGRESS
) {
464 eprintsoline(so2
, error
); goto done
;
468 error
= socket_accept(so1
, 0, CRED(), &nso
);
470 eprintsoline(so1
, error
);
474 /* wait for so2 being SS_CONNECTED ignoring signals */
475 mutex_enter(&so2
->so_lock
);
476 error
= sowaitconnected(so2
, 0, 1);
477 mutex_exit(&so2
->so_lock
);
479 (void) socket_close(nso
, 0, CRED());
481 eprintsoline(so2
, error
);
486 if (error
= falloc(nvp
, FWRITE
|FREAD
, &nfp
, &nfd
)) {
487 (void) socket_close(nso
, 0, CRED());
489 eprintsoline(nso
, error
);
493 * copy over FNONBLOCK and FNDELAY flags should they exist
495 if (so1
->so_state
& SS_NONBLOCK
)
496 nfp
->f_flag
|= FNONBLOCK
;
497 if (so1
->so_state
& SS_NDELAY
)
498 nfp
->f_flag
|= FNDELAY
;
501 * fill in the entries that falloc reserved
503 mutex_exit(&nfp
->f_tlock
);
507 * get the original flags before we release
509 VERIFY(f_getfd_error(svs
[0], &orig_flags
) == 0);
515 * If FD_CLOEXEC was set on the filedescriptor we're
516 * swapping out, we should set it on the new one too.
518 if (orig_flags
& FD_CLOEXEC
) {
519 f_setfd(nfd
, FD_CLOEXEC
);
523 * The socketpair library routine will close the original
524 * svs[0] when this code passes out a different file
529 if (copyout(svs
, sv
, sizeof (svs
))) {
530 (void) closeandsetf(nfd
, NULL
);
532 return (set_errno(EFAULT
));
540 return (set_errno(error
));
544 bind(int sock
, struct sockaddr
*name
, socklen_t namelen
)
549 dprint(1, ("bind(%d, %p, %d)\n",
550 sock
, (void *)name
, namelen
));
552 if ((so
= getsonode(sock
, &error
, NULL
)) == NULL
)
553 return (set_errno(error
));
555 /* Allocate and copyin name */
557 * X/Open test does not expect EFAULT with NULL name and non-zero
560 if (name
!= NULL
&& namelen
!= 0) {
561 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
562 name
= copyin_name(so
, name
, &namelen
, &error
);
565 return (set_errno(error
));
572 error
= socket_bind(so
, name
, namelen
, 0, CRED());
576 kmem_free(name
, (size_t)namelen
);
579 return (set_errno(error
));
584 listen(int sock
, int backlog
)
589 dprint(1, ("listen(%d, %d)\n",
592 if ((so
= getsonode(sock
, &error
, NULL
)) == NULL
)
593 return (set_errno(error
));
595 error
= socket_listen(so
, backlog
, CRED());
599 return (set_errno(error
));
604 accept(int sock
, struct sockaddr
*name
, socklen_t
*namelenp
, int flags
)
615 struct sockaddr
*addrp
;
618 dprint(1, ("accept(%d, %p, %p)\n",
619 sock
, (void *)name
, (void *)namelenp
));
621 if (flags
& ~(SOCK_CLOEXEC
|SOCK_NONBLOCK
|SOCK_NDELAY
)) {
622 return (set_errno(EINVAL
));
625 /* Translate SOCK_ flags to their SS_ variant */
627 if (flags
& SOCK_NONBLOCK
)
628 ssflags
|= SS_NONBLOCK
;
629 if (flags
& SOCK_NDELAY
)
630 ssflags
|= SS_NDELAY
;
632 if ((so
= getsonode(sock
, &error
, &fp
)) == NULL
)
633 return (set_errno(error
));
636 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
637 if (copyin(namelenp
, &namelen
, sizeof (namelen
))) {
639 return (set_errno(EFAULT
));
642 error
= useracc(name
, (size_t)namelen
, B_WRITE
);
643 if (error
&& do_useracc
) {
645 return (set_errno(EFAULT
));
654 * Allocate the user fd before socket_accept() in order to
655 * catch EMFILE errors before calling socket_accept().
657 if ((nfd
= ufalloc(0)) == -1) {
658 eprintsoline(so
, EMFILE
);
660 return (set_errno(EMFILE
));
662 error
= socket_accept(so
, fp
->f_flag
, CRED(), &nso
);
666 return (set_errno(error
));
671 ASSERT(MUTEX_NOT_HELD(&nso
->so_lock
));
673 addrlen
= so
->so_max_addr_len
;
674 addrp
= kmem_alloc(addrlen
, KM_SLEEP
);
676 if ((error
= socket_getpeername(nso
, (struct sockaddr
*)addrp
,
677 &addrlen
, B_TRUE
, CRED())) == 0) {
678 error
= copyout_name(name
, namelen
, namelenp
,
681 ASSERT(error
== EINVAL
|| error
== ENOTCONN
);
682 error
= ECONNABORTED
;
684 kmem_free(addrp
, so
->so_max_addr_len
);
689 (void) socket_close(nso
, 0, CRED());
692 return (set_errno(error
));
694 if (error
= falloc(NULL
, FWRITE
|FREAD
, &nfp
, NULL
)) {
696 (void) socket_close(nso
, 0, CRED());
698 eprintsoline(so
, error
);
700 return (set_errno(error
));
703 * fill in the entries that falloc reserved
706 mutex_exit(&nfp
->f_tlock
);
710 * Act on SOCK_CLOEXEC from flags
712 if (flags
& SOCK_CLOEXEC
) {
713 f_setfd(nfd
, FD_CLOEXEC
);
717 * Copy FNDELAY and FNONBLOCK from listener to acceptor
720 if ((ssflags
| so
->so_state
) & (SS_NDELAY
|SS_NONBLOCK
)) {
721 uint_t oflag
= nfp
->f_flag
;
724 if ((ssflags
| so
->so_state
) & SS_NONBLOCK
)
726 else if ((ssflags
| so
->so_state
) & SS_NDELAY
)
730 * This code is a simplification of the F_SETFL code in fcntl()
731 * Ignore any errors from fop_setfl.
733 if ((error
= fop_setfl(nvp
, oflag
, arg
, nfp
->f_cred
, NULL
))
735 eprintsoline(so
, error
);
738 mutex_enter(&nfp
->f_tlock
);
739 nfp
->f_flag
&= ~FCNTLFLAGS
;
741 mutex_exit(&nfp
->f_tlock
);
749 connect(int sock
, struct sockaddr
*name
, socklen_t namelen
)
755 dprint(1, ("connect(%d, %p, %d)\n",
756 sock
, (void *)name
, namelen
));
758 if ((so
= getsonode(sock
, &error
, &fp
)) == NULL
)
759 return (set_errno(error
));
761 /* Allocate and copyin name */
763 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
764 name
= copyin_name(so
, name
, &namelen
, &error
);
767 return (set_errno(error
));
772 error
= socket_connect(so
, name
, namelen
, fp
->f_flag
, 0, CRED());
775 kmem_free(name
, (size_t)namelen
);
777 return (set_errno(error
));
782 shutdown(int sock
, int how
)
787 dprint(1, ("shutdown(%d, %d)\n",
790 if ((so
= getsonode(sock
, &error
, NULL
)) == NULL
)
791 return (set_errno(error
));
793 error
= socket_shutdown(so
, how
, CRED());
797 return (set_errno(error
));
802 * Common receive routine.
805 recvit(int sock
, struct msghdr
*msg
, struct uio
*uiop
, int flags
,
806 socklen_t
*namelenp
, socklen_t
*controllenp
, int *flagsp
)
813 socklen_t controllen
;
817 if ((so
= getsonode(sock
, &error
, &fp
)) == NULL
)
818 return (set_errno(error
));
820 len
= uiop
->uio_resid
;
821 uiop
->uio_fmode
= fp
->f_flag
;
822 uiop
->uio_extflg
= UIO_COPY_CACHED
;
824 name
= msg
->msg_name
;
825 namelen
= msg
->msg_namelen
;
826 control
= msg
->msg_control
;
827 controllen
= msg
->msg_controllen
;
829 msg
->msg_flags
= flags
& (MSG_OOB
| MSG_PEEK
| MSG_WAITALL
|
832 error
= socket_recvmsg(so
, msg
, uiop
, CRED());
835 return (set_errno(error
));
837 lwp_stat_update(LWP_STAT_MSGRCV
, 1);
840 error
= copyout_name(name
, namelen
, namelenp
,
841 msg
->msg_name
, msg
->msg_namelen
);
845 if (flagsp
!= NULL
) {
847 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
848 * when controllen is zero and there is control data to
851 if (controllen
!= 0 &&
852 (msg
->msg_controllen
> controllen
|| control
== NULL
)) {
853 dprint(1, ("recvit: CTRUNC %d %d %p\n",
854 msg
->msg_controllen
, controllen
, control
));
856 msg
->msg_flags
|= MSG_CTRUNC
;
858 if (copyout(&msg
->msg_flags
, flagsp
,
859 sizeof (msg
->msg_flags
))) {
865 * Note: This MUST be done last. There can be no "goto err" after this
866 * point since it could make so_closefds run twice on some part
867 * of the file descriptor array.
869 if (controllen
!= 0) {
870 error
= copyout_arg(control
, controllen
, controllenp
,
871 msg
->msg_control
, msg
->msg_controllen
);
875 if (msg
->msg_controllen
> controllen
|| control
== NULL
) {
878 so_closefds(msg
->msg_control
, msg
->msg_controllen
,
882 if (msg
->msg_namelen
!= 0)
883 kmem_free(msg
->msg_name
, (size_t)msg
->msg_namelen
);
884 if (msg
->msg_controllen
!= 0)
885 kmem_free(msg
->msg_control
, (size_t)msg
->msg_controllen
);
886 return (len
- uiop
->uio_resid
);
890 * If we fail and the control part contains file descriptors
891 * we have to close the fd's.
893 if (msg
->msg_controllen
!= 0)
894 so_closefds(msg
->msg_control
, msg
->msg_controllen
, 0);
895 if (msg
->msg_namelen
!= 0)
896 kmem_free(msg
->msg_name
, (size_t)msg
->msg_namelen
);
897 if (msg
->msg_controllen
!= 0)
898 kmem_free(msg
->msg_control
, (size_t)msg
->msg_controllen
);
899 return (set_errno(error
));
906 recv(int sock
, void *buffer
, size_t len
, int flags
)
910 struct iovec aiov
[1];
912 dprint(1, ("recv(%d, %p, %ld, %d)\n",
913 sock
, buffer
, len
, flags
));
915 if ((ssize_t
)len
< 0) {
916 return (set_errno(EINVAL
));
919 aiov
[0].iov_base
= buffer
;
920 aiov
[0].iov_len
= len
;
921 auio
.uio_loffset
= 0;
924 auio
.uio_resid
= len
;
925 auio
.uio_segflg
= UIO_USERSPACE
;
928 lmsg
.msg_namelen
= 0;
929 lmsg
.msg_controllen
= 0;
931 return (recvit(sock
, &lmsg
, &auio
, flags
, NULL
, NULL
, NULL
));
935 recvfrom(int sock
, void *buffer
, size_t len
, int flags
, struct sockaddr
*name
,
940 struct iovec aiov
[1];
942 dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
943 sock
, buffer
, len
, flags
, (void *)name
, (void *)namelenp
));
945 if ((ssize_t
)len
< 0) {
946 return (set_errno(EINVAL
));
949 aiov
[0].iov_base
= buffer
;
950 aiov
[0].iov_len
= len
;
951 auio
.uio_loffset
= 0;
954 auio
.uio_resid
= len
;
955 auio
.uio_segflg
= UIO_USERSPACE
;
958 lmsg
.msg_name
= (char *)name
;
959 if (namelenp
!= NULL
) {
960 if (copyin(namelenp
, &lmsg
.msg_namelen
,
961 sizeof (lmsg
.msg_namelen
)))
962 return (set_errno(EFAULT
));
964 lmsg
.msg_namelen
= 0;
966 lmsg
.msg_controllen
= 0;
969 return (recvit(sock
, &lmsg
, &auio
, flags
, namelenp
, NULL
, NULL
));
973 recvmsg(int sock
, struct msghdr
*msg
, int flags
)
975 STRUCT_DECL(msghdr
, u_lmsg
);
976 STRUCT_HANDLE(msghdr
, umsgptr
);
979 struct iovec buf
[IOV_MAX_STACK
], *aiov
= buf
;
987 dprint(1, ("recvmsg(%d, %p, %d)\n",
988 sock
, (void *)msg
, flags
));
990 model
= get_udatamodel();
991 STRUCT_INIT(u_lmsg
, model
);
992 STRUCT_SET_HANDLE(umsgptr
, model
, msg
);
994 if (copyin(msg
, STRUCT_BUF(u_lmsg
), STRUCT_SIZE(u_lmsg
)))
995 return (set_errno(EFAULT
));
996 flagsp
= STRUCT_FADDR(umsgptr
, msg_flags
);
999 * Code below us will kmem_alloc memory and hang it
1000 * off msg_control and msg_name fields. This forces
1001 * us to copy the structure to its native form.
1003 lmsg
.msg_name
= STRUCT_FGETP(u_lmsg
, msg_name
);
1004 lmsg
.msg_namelen
= STRUCT_FGET(u_lmsg
, msg_namelen
);
1005 lmsg
.msg_iov
= STRUCT_FGETP(u_lmsg
, msg_iov
);
1006 lmsg
.msg_iovlen
= STRUCT_FGET(u_lmsg
, msg_iovlen
);
1007 lmsg
.msg_control
= STRUCT_FGETP(u_lmsg
, msg_control
);
1008 lmsg
.msg_controllen
= STRUCT_FGET(u_lmsg
, msg_controllen
);
1009 lmsg
.msg_flags
= STRUCT_FGET(u_lmsg
, msg_flags
);
1011 iovcnt
= lmsg
.msg_iovlen
;
1013 if (iovcnt
< 0 || iovcnt
> IOV_MAX
)
1014 return (set_errno(EMSGSIZE
));
1016 if (iovcnt
> IOV_MAX_STACK
) {
1017 iovsize
= iovcnt
* sizeof (struct iovec
);
1018 aiov
= kmem_alloc(iovsize
, KM_SLEEP
);
1021 #ifdef _SYSCALL32_IMPL
1023 * 32-bit callers need to have their iovec expanded, while ensuring
1024 * that they can't move more than 2Gbytes of data in a single call.
1026 if (model
== DATAMODEL_ILP32
) {
1027 struct iovec32 buf32
[IOV_MAX_STACK
], *aiov32
= buf32
;
1031 iov32size
= iovcnt
* sizeof (struct iovec32
);
1033 aiov32
= kmem_alloc(iov32size
, KM_SLEEP
);
1035 if (copyin((struct iovec32
*)lmsg
.msg_iov
, aiov32
, iov32size
)) {
1037 kmem_free(aiov32
, iov32size
);
1038 kmem_free(aiov
, iovsize
);
1041 return (set_errno(EFAULT
));
1045 for (i
= 0; i
< iovcnt
; i
++) {
1048 iovlen32
= aiov32
[i
].iov_len
;
1049 count32
+= iovlen32
;
1050 if (iovlen32
< 0 || count32
< 0) {
1052 kmem_free(aiov32
, iov32size
);
1053 kmem_free(aiov
, iovsize
);
1056 return (set_errno(EINVAL
));
1059 aiov
[i
].iov_len
= iovlen32
;
1061 (caddr_t
)(uintptr_t)aiov32
[i
].iov_base
;
1065 kmem_free(aiov32
, iov32size
);
1067 #endif /* _SYSCALL32_IMPL */
1068 if (copyin(lmsg
.msg_iov
, aiov
, iovcnt
* sizeof (struct iovec
))) {
1070 kmem_free(aiov
, iovsize
);
1072 return (set_errno(EFAULT
));
1075 for (i
= 0; i
< iovcnt
; i
++) {
1076 ssize_t iovlen
= aiov
[i
].iov_len
;
1078 if (iovlen
< 0 || len
< 0) {
1080 kmem_free(aiov
, iovsize
);
1082 return (set_errno(EINVAL
));
1085 auio
.uio_loffset
= 0;
1086 auio
.uio_iov
= aiov
;
1087 auio
.uio_iovcnt
= iovcnt
;
1088 auio
.uio_resid
= len
;
1089 auio
.uio_segflg
= UIO_USERSPACE
;
1092 if (lmsg
.msg_control
!= NULL
&&
1094 useracc(lmsg
.msg_control
, lmsg
.msg_controllen
,
1097 kmem_free(aiov
, iovsize
);
1099 return (set_errno(EFAULT
));
1102 rval
= recvit(sock
, &lmsg
, &auio
, flags
,
1103 STRUCT_FADDR(umsgptr
, msg_namelen
),
1104 STRUCT_FADDR(umsgptr
, msg_controllen
), flagsp
);
1107 kmem_free(aiov
, iovsize
);
1113 * Common send function.
1116 sendit(int sock
, struct msghdr
*msg
, struct uio
*uiop
, int flags
)
1123 socklen_t controllen
;
1127 if ((so
= getsonode(sock
, &error
, &fp
)) == NULL
)
1128 return (set_errno(error
));
1130 uiop
->uio_fmode
= fp
->f_flag
;
1132 if (so
->so_family
== AF_UNIX
)
1133 uiop
->uio_extflg
= UIO_COPY_CACHED
;
1135 uiop
->uio_extflg
= UIO_COPY_DEFAULT
;
1137 /* Allocate and copyin name and control */
1138 name
= msg
->msg_name
;
1139 namelen
= msg
->msg_namelen
;
1140 if (name
!= NULL
&& namelen
!= 0) {
1141 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
1142 name
= copyin_name(so
,
1143 (struct sockaddr
*)name
,
1147 /* copyin_name null terminates addresses for AF_UNIX */
1148 msg
->msg_namelen
= namelen
;
1149 msg
->msg_name
= name
;
1151 msg
->msg_name
= name
= NULL
;
1152 msg
->msg_namelen
= namelen
= 0;
1155 control
= msg
->msg_control
;
1156 controllen
= msg
->msg_controllen
;
1157 if ((control
!= NULL
) && (controllen
!= 0)) {
1159 * Verify that the length is not excessive to prevent
1160 * an application from consuming all of kernel memory.
1162 if (controllen
> SO_MAXARGSIZE
) {
1166 control
= kmem_alloc(controllen
, KM_SLEEP
);
1168 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
1169 if (copyin(msg
->msg_control
, control
, controllen
)) {
1173 msg
->msg_control
= control
;
1175 msg
->msg_control
= control
= NULL
;
1176 msg
->msg_controllen
= controllen
= 0;
1179 len
= uiop
->uio_resid
;
1180 msg
->msg_flags
= flags
;
1182 error
= socket_sendmsg(so
, msg
, uiop
, CRED());
1184 if (control
!= NULL
)
1185 kmem_free(control
, controllen
);
1188 kmem_free(name
, namelen
);
1192 return (set_errno(error
));
1194 lwp_stat_update(LWP_STAT_MSGSND
, 1);
1196 return (len
- uiop
->uio_resid
);
1200 * Native system call
1203 send(int sock
, void *buffer
, size_t len
, int flags
)
1207 struct iovec aiov
[1];
1209 dprint(1, ("send(%d, %p, %ld, %d)\n",
1210 sock
, buffer
, len
, flags
));
1212 if ((ssize_t
)len
< 0) {
1213 return (set_errno(EINVAL
));
1216 aiov
[0].iov_base
= buffer
;
1217 aiov
[0].iov_len
= len
;
1218 auio
.uio_loffset
= 0;
1219 auio
.uio_iov
= aiov
;
1220 auio
.uio_iovcnt
= 1;
1221 auio
.uio_resid
= len
;
1222 auio
.uio_segflg
= UIO_USERSPACE
;
1225 lmsg
.msg_name
= NULL
;
1226 lmsg
.msg_control
= NULL
;
1227 return (sendit(sock
, &lmsg
, &auio
, flags
));
1231 sendmsg(int sock
, struct msghdr
*msg
, int flags
)
1234 STRUCT_DECL(msghdr
, u_lmsg
);
1236 struct iovec buf
[IOV_MAX_STACK
], *aiov
= buf
;
1237 ssize_t iovsize
= 0;
1243 dprint(1, ("sendmsg(%d, %p, %d)\n", sock
, (void *)msg
, flags
));
1245 model
= get_udatamodel();
1246 STRUCT_INIT(u_lmsg
, model
);
1248 if (copyin(msg
, (char *)STRUCT_BUF(u_lmsg
),
1249 STRUCT_SIZE(u_lmsg
)))
1250 return (set_errno(EFAULT
));
1252 * Code below us will kmem_alloc memory and hang it
1253 * off msg_control and msg_name fields. This forces
1254 * us to copy the structure to its native form.
1256 lmsg
.msg_name
= STRUCT_FGETP(u_lmsg
, msg_name
);
1257 lmsg
.msg_namelen
= STRUCT_FGET(u_lmsg
, msg_namelen
);
1258 lmsg
.msg_iov
= STRUCT_FGETP(u_lmsg
, msg_iov
);
1259 lmsg
.msg_iovlen
= STRUCT_FGET(u_lmsg
, msg_iovlen
);
1260 lmsg
.msg_control
= STRUCT_FGETP(u_lmsg
, msg_control
);
1261 lmsg
.msg_controllen
= STRUCT_FGET(u_lmsg
, msg_controllen
);
1262 lmsg
.msg_flags
= STRUCT_FGET(u_lmsg
, msg_flags
);
1264 iovcnt
= lmsg
.msg_iovlen
;
1266 if (iovcnt
< 0 || iovcnt
> IOV_MAX
)
1267 return (set_errno(EMSGSIZE
));
1269 if (iovcnt
> IOV_MAX_STACK
) {
1270 iovsize
= iovcnt
* sizeof (struct iovec
);
1271 aiov
= kmem_alloc(iovsize
, KM_SLEEP
);
1274 #ifdef _SYSCALL32_IMPL
1276 * 32-bit callers need to have their iovec expanded, while ensuring
1277 * that they can't move more than 2Gbytes of data in a single call.
1279 if (model
== DATAMODEL_ILP32
) {
1280 struct iovec32 buf32
[IOV_MAX_STACK
], *aiov32
= buf32
;
1284 iov32size
= iovcnt
* sizeof (struct iovec32
);
1286 aiov32
= kmem_alloc(iov32size
, KM_SLEEP
);
1289 copyin((struct iovec32
*)lmsg
.msg_iov
, aiov32
, iov32size
)) {
1291 kmem_free(aiov32
, iov32size
);
1292 kmem_free(aiov
, iovsize
);
1295 return (set_errno(EFAULT
));
1299 for (i
= 0; i
< iovcnt
; i
++) {
1302 iovlen32
= aiov32
[i
].iov_len
;
1303 count32
+= iovlen32
;
1304 if (iovlen32
< 0 || count32
< 0) {
1306 kmem_free(aiov32
, iov32size
);
1307 kmem_free(aiov
, iovsize
);
1310 return (set_errno(EINVAL
));
1313 aiov
[i
].iov_len
= iovlen32
;
1315 (caddr_t
)(uintptr_t)aiov32
[i
].iov_base
;
1319 kmem_free(aiov32
, iov32size
);
1321 #endif /* _SYSCALL32_IMPL */
1323 copyin(lmsg
.msg_iov
, aiov
,
1324 (unsigned)iovcnt
* sizeof (struct iovec
))) {
1326 kmem_free(aiov
, iovsize
);
1328 return (set_errno(EFAULT
));
1331 for (i
= 0; i
< iovcnt
; i
++) {
1332 ssize_t iovlen
= aiov
[i
].iov_len
;
1334 if (iovlen
< 0 || len
< 0) {
1336 kmem_free(aiov
, iovsize
);
1338 return (set_errno(EINVAL
));
1341 auio
.uio_loffset
= 0;
1342 auio
.uio_iov
= aiov
;
1343 auio
.uio_iovcnt
= iovcnt
;
1344 auio
.uio_resid
= len
;
1345 auio
.uio_segflg
= UIO_USERSPACE
;
1348 rval
= sendit(sock
, &lmsg
, &auio
, flags
);
1351 kmem_free(aiov
, iovsize
);
1357 sendto(int sock
, void *buffer
, size_t len
, int flags
,
1358 struct sockaddr
*name
, socklen_t namelen
)
1362 struct iovec aiov
[1];
1364 dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1365 sock
, buffer
, len
, flags
, (void *)name
, namelen
));
1367 if ((ssize_t
)len
< 0) {
1368 return (set_errno(EINVAL
));
1371 aiov
[0].iov_base
= buffer
;
1372 aiov
[0].iov_len
= len
;
1373 auio
.uio_loffset
= 0;
1374 auio
.uio_iov
= aiov
;
1375 auio
.uio_iovcnt
= 1;
1376 auio
.uio_resid
= len
;
1377 auio
.uio_segflg
= UIO_USERSPACE
;
1380 lmsg
.msg_name
= (char *)name
;
1381 lmsg
.msg_namelen
= namelen
;
1382 lmsg
.msg_control
= NULL
;
1383 return (sendit(sock
, &lmsg
, &auio
, flags
));
1387 getpeername(int sock
, struct sockaddr
*name
, socklen_t
*namelenp
)
1392 socklen_t sock_addrlen
;
1393 struct sockaddr
*sock_addrp
;
1395 dprint(1, ("getpeername(%d, %p, %p)\n",
1396 sock
, (void *)name
, (void *)namelenp
));
1398 if ((so
= getsonode(sock
, &error
, NULL
)) == NULL
)
1401 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
1402 if (copyin(namelenp
, &namelen
, sizeof (namelen
)) ||
1403 (name
== NULL
&& namelen
!= 0)) {
1407 sock_addrlen
= so
->so_max_addr_len
;
1408 sock_addrp
= kmem_alloc(sock_addrlen
, KM_SLEEP
);
1410 if ((error
= socket_getpeername(so
, sock_addrp
, &sock_addrlen
,
1411 B_FALSE
, CRED())) == 0) {
1412 ASSERT(sock_addrlen
<= so
->so_max_addr_len
);
1413 error
= copyout_name(name
, namelen
, namelenp
,
1414 (void *)sock_addrp
, sock_addrlen
);
1416 kmem_free(sock_addrp
, so
->so_max_addr_len
);
1419 bad
: return (error
!= 0 ? set_errno(error
) : 0);
1423 getsockname(int sock
, struct sockaddr
*name
, socklen_t
*namelenp
)
1427 socklen_t namelen
, sock_addrlen
;
1428 struct sockaddr
*sock_addrp
;
1430 dprint(1, ("getsockname(%d, %p, %p)\n",
1431 sock
, (void *)name
, (void *)namelenp
));
1433 if ((so
= getsonode(sock
, &error
, NULL
)) == NULL
)
1436 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
1437 if (copyin(namelenp
, &namelen
, sizeof (namelen
)) ||
1438 (name
== NULL
&& namelen
!= 0)) {
1443 sock_addrlen
= so
->so_max_addr_len
;
1444 sock_addrp
= kmem_alloc(sock_addrlen
, KM_SLEEP
);
1445 if ((error
= socket_getsockname(so
, sock_addrp
, &sock_addrlen
,
1447 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
1448 ASSERT(sock_addrlen
<= so
->so_max_addr_len
);
1449 error
= copyout_name(name
, namelen
, namelenp
,
1450 (void *)sock_addrp
, sock_addrlen
);
1452 kmem_free(sock_addrp
, so
->so_max_addr_len
);
1455 bad
: return (error
!= 0 ? set_errno(error
) : 0);
1459 getsockopt(int sock
, int level
, int option_name
, void *option_value
,
1460 socklen_t
*option_lenp
)
1463 socklen_t optlen
, optlen_res
;
1467 dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1468 sock
, level
, option_name
, option_value
, (void *)option_lenp
));
1470 if ((so
= getsonode(sock
, &error
, NULL
)) == NULL
)
1471 return (set_errno(error
));
1473 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
1474 if (copyin(option_lenp
, &optlen
, sizeof (optlen
))) {
1476 return (set_errno(EFAULT
));
1479 * Verify that the length is not excessive to prevent
1480 * an application from consuming all of kernel memory.
1482 if (optlen
> SO_MAXARGSIZE
) {
1485 return (set_errno(error
));
1487 optval
= kmem_alloc(optlen
, KM_SLEEP
);
1488 optlen_res
= optlen
;
1489 error
= socket_getsockopt(so
, level
, option_name
, optval
,
1490 &optlen_res
, 0, CRED());
1493 kmem_free(optval
, optlen
);
1494 return (set_errno(error
));
1496 error
= copyout_arg(option_value
, optlen
, option_lenp
,
1497 optval
, optlen_res
);
1498 kmem_free(optval
, optlen
);
1500 return (set_errno(error
));
1505 setsockopt(int sock
, int level
, int option_name
, void *option_value
,
1506 socklen_t option_len
)
1510 void *optval
= NULL
;
1513 dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1514 sock
, level
, option_name
, option_value
, option_len
));
1516 if ((so
= getsonode(sock
, &error
, NULL
)) == NULL
)
1517 return (set_errno(error
));
1519 if (option_value
!= NULL
) {
1520 if (option_len
!= 0) {
1522 * Verify that the length is not excessive to prevent
1523 * an application from consuming all of kernel memory.
1525 if (option_len
> SO_MAXARGSIZE
) {
1529 optval
= option_len
<= sizeof (buffer
) ?
1530 &buffer
: kmem_alloc((size_t)option_len
, KM_SLEEP
);
1531 ASSERT(MUTEX_NOT_HELD(&so
->so_lock
));
1532 if (copyin(option_value
, optval
, (size_t)option_len
)) {
1540 error
= socket_setsockopt(so
, level
, option_name
, optval
,
1541 (t_uscalar_t
)option_len
, CRED());
1543 if (optval
!= buffer
)
1544 kmem_free(optval
, (size_t)option_len
);
1548 return (set_errno(error
));
1553 sockconf_add_sock(int family
, int type
, int protocol
, char *name
)
1556 char *kdevpath
= NULL
;
1557 char *kmodule
= NULL
;
1560 struct sockparams
*sp
;
1566 * This also makes it possible to check for too long pathnames.
1567 * Compress the space needed for the name before passing it
1568 * to soconfig - soconfig will store the string until
1569 * the configuration is removed.
1571 buf
= kmem_alloc(MAXPATHLEN
, KM_SLEEP
);
1572 if ((error
= copyinstr(name
, buf
, MAXPATHLEN
, &pathlen
)) != 0) {
1573 kmem_free(buf
, MAXPATHLEN
);
1576 if (strncmp(buf
, "/dev", strlen("/dev")) == 0) {
1578 kdevpath
= kmem_alloc(pathlen
, KM_SLEEP
);
1579 bcopy(buf
, kdevpath
, pathlen
);
1580 kdevpath
[pathlen
- 1] = '\0';
1582 /* For socket module */
1583 kmodule
= kmem_alloc(pathlen
, KM_SLEEP
);
1584 bcopy(buf
, kmodule
, pathlen
);
1585 kmodule
[pathlen
- 1] = '\0';
1588 kmem_free(buf
, MAXPATHLEN
);
1590 /* sockparams_create frees mod name and devpath upon failure */
1591 sp
= sockparams_create(family
, type
, protocol
, kmodule
,
1592 kdevpath
, pathlen
, 0, KM_SLEEP
, &error
);
1594 error
= sockparams_add(sp
);
1596 sockparams_destroy(sp
);
/*
 * Remove the socket configuration entry for (family, type, protocol).
 * Thin wrapper around sockparams_delete(); returns its errno value.
 */
static int
sockconf_remove_sock(int family, int type, int protocol)
{
	return (sockparams_delete(family, type, protocol));
}
1609 sockconfig_remove_filter(const char *uname
)
1611 char kname
[SOF_MAXNAMELEN
];
1616 if ((error
= copyinstr(uname
, kname
, SOF_MAXNAMELEN
, &len
)) != 0)
1619 ent
= sof_entry_remove_by_name(kname
);
1623 mutex_enter(&ent
->sofe_lock
);
1624 ASSERT(!(ent
->sofe_flags
& SOFEF_CONDEMED
));
1625 if (ent
->sofe_refcnt
== 0) {
1626 mutex_exit(&ent
->sofe_lock
);
1627 sof_entry_free(ent
);
1629 /* let the last socket free the filter */
1630 ent
->sofe_flags
|= SOFEF_CONDEMED
;
1631 mutex_exit(&ent
->sofe_lock
);
1638 sockconfig_add_filter(const char *uname
, void *ufilpropp
)
1640 struct sockconfig_filter_props filprop
;
1643 size_t tuplesz
, len
;
1644 char hintbuf
[SOF_MAXNAMELEN
];
1646 ent
= kmem_zalloc(sizeof (sof_entry_t
), KM_SLEEP
);
1647 mutex_init(&ent
->sofe_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1649 if ((error
= copyinstr(uname
, ent
->sofe_name
, SOF_MAXNAMELEN
,
1651 sof_entry_free(ent
);
1655 if (get_udatamodel() == DATAMODEL_NATIVE
) {
1656 if (copyin(ufilpropp
, &filprop
, sizeof (filprop
)) != 0) {
1657 sof_entry_free(ent
);
1661 #ifdef _SYSCALL32_IMPL
1663 struct sockconfig_filter_props32 filprop32
;
1665 if (copyin(ufilpropp
, &filprop32
, sizeof (filprop32
)) != 0) {
1666 sof_entry_free(ent
);
1669 filprop
.sfp_modname
= (char *)(uintptr_t)filprop32
.sfp_modname
;
1670 filprop
.sfp_autoattach
= filprop32
.sfp_autoattach
;
1671 filprop
.sfp_hint
= filprop32
.sfp_hint
;
1672 filprop
.sfp_hintarg
= (char *)(uintptr_t)filprop32
.sfp_hintarg
;
1673 filprop
.sfp_socktuple_cnt
= filprop32
.sfp_socktuple_cnt
;
1674 filprop
.sfp_socktuple
=
1675 (sof_socktuple_t
*)(uintptr_t)filprop32
.sfp_socktuple
;
1677 #endif /* _SYSCALL32_IMPL */
1679 if ((error
= copyinstr(filprop
.sfp_modname
, ent
->sofe_modname
,
1680 sizeof (ent
->sofe_modname
), &len
)) != 0) {
1681 sof_entry_free(ent
);
1686 * A filter must specify at least one socket tuple.
1688 if (filprop
.sfp_socktuple_cnt
== 0 ||
1689 filprop
.sfp_socktuple_cnt
> SOF_MAXSOCKTUPLECNT
) {
1690 sof_entry_free(ent
);
1693 ent
->sofe_flags
= filprop
.sfp_autoattach
? SOFEF_AUTO
: SOFEF_PROG
;
1694 ent
->sofe_hint
= filprop
.sfp_hint
;
1697 * Verify the hint, and copy in the hint argument, if necessary.
1699 switch (ent
->sofe_hint
) {
1700 case SOF_HINT_BEFORE
:
1701 case SOF_HINT_AFTER
:
1702 if ((error
= copyinstr(filprop
.sfp_hintarg
, hintbuf
,
1703 sizeof (hintbuf
), &len
)) != 0) {
1704 sof_entry_free(ent
);
1707 ent
->sofe_hintarg
= kmem_alloc(len
, KM_SLEEP
);
1708 bcopy(hintbuf
, ent
->sofe_hintarg
, len
);
1711 case SOF_HINT_BOTTOM
:
1712 /* hints cannot be used with programmatic filters */
1713 if (ent
->sofe_flags
& SOFEF_PROG
) {
1714 sof_entry_free(ent
);
1721 /* bad hint value */
1722 sof_entry_free(ent
);
1726 ent
->sofe_socktuple_cnt
= filprop
.sfp_socktuple_cnt
;
1727 tuplesz
= sizeof (sof_socktuple_t
) * ent
->sofe_socktuple_cnt
;
1728 ent
->sofe_socktuple
= kmem_alloc(tuplesz
, KM_SLEEP
);
1730 if (get_udatamodel() == DATAMODEL_NATIVE
) {
1731 if (copyin(filprop
.sfp_socktuple
, ent
->sofe_socktuple
,
1733 sof_entry_free(ent
);
1737 #ifdef _SYSCALL32_IMPL
1740 caddr_t data
= (caddr_t
)filprop
.sfp_socktuple
;
1741 sof_socktuple_t
*tup
= ent
->sofe_socktuple
;
1742 sof_socktuple32_t tup32
;
1744 tup
= ent
->sofe_socktuple
;
1745 for (i
= 0; i
< ent
->sofe_socktuple_cnt
; i
++, tup
++) {
1746 ASSERT(tup
< ent
->sofe_socktuple
+ tuplesz
);
1748 if (copyin(data
, &tup32
, sizeof (tup32
)) != 0) {
1749 sof_entry_free(ent
);
1752 tup
->sofst_family
= tup32
.sofst_family
;
1753 tup
->sofst_type
= tup32
.sofst_type
;
1754 tup
->sofst_protocol
= tup32
.sofst_protocol
;
1756 data
+= sizeof (tup32
);
1759 #endif /* _SYSCALL32_IMPL */
1761 /* Sockets can start using the filter as soon as the filter is added */
1762 if ((error
= sof_entry_add(ent
)) != 0)
1763 sof_entry_free(ent
);
1769 * Socket configuration system call. It is used to add and remove
1773 sockconfig(int cmd
, void *arg1
, void *arg2
, void *arg3
, void *arg4
)
1777 if (secpolicy_net_config(CRED(), B_FALSE
) != 0)
1778 return (set_errno(EPERM
));
1781 case SOCKCONFIG_ADD_SOCK
:
1782 error
= sockconf_add_sock((int)(uintptr_t)arg1
,
1783 (int)(uintptr_t)arg2
, (int)(uintptr_t)arg3
, arg4
);
1785 case SOCKCONFIG_REMOVE_SOCK
:
1786 error
= sockconf_remove_sock((int)(uintptr_t)arg1
,
1787 (int)(uintptr_t)arg2
, (int)(uintptr_t)arg3
);
1789 case SOCKCONFIG_ADD_FILTER
:
1790 error
= sockconfig_add_filter((const char *)arg1
, arg2
);
1792 case SOCKCONFIG_REMOVE_FILTER
:
1793 error
= sockconfig_remove_filter((const char *)arg1
);
1795 case SOCKCONFIG_GET_SOCKTABLE
:
1796 error
= sockparams_copyout_socktable((int)(uintptr_t)arg1
);
1800 cmn_err(CE_NOTE
, "sockconfig: unkonwn subcommand %d", cmd
);
1808 return (set_errno(error
));
1815 * Sendfile is implemented through two schemes, direct I/O or by
1816 * caching in the filesystem page cache. We cache the input file by
1817 * default and use direct I/O only if sendfile_max_size is set
1818 * appropriately as explained below. Note that this logic is consistent
1819 * with other filesystems where caching is turned on by default
1820 * unless explicitly turned off by using the DIRECTIO ioctl.
1822 * We choose a slightly different scheme here. One can turn off
1823 * caching by setting sendfile_max_size to 0. One can also enable
1824 * caching of files <= sendfile_max_size by setting sendfile_max_size
1825 * to an appropriate value. By default sendfile_max_size is set to the
1826 * maximum value so that all files are cached. In future, we may provide
1827 * better interfaces for caching the file.
1829 * Sendfile through Direct I/O (Zero copy)
1830 * --------------------------------------
1832 * As disks are normally slower than the network, we can't have a
1833 * single thread that reads the disk and writes to the network. We
1834 * need to have parallelism. This is done by having the sendfile
1835 * thread create another thread that reads from the filesystem
1836 * and queues it for network processing. In this scheme, the data
1837 * is never copied anywhere i.e it is zero copy unlike the other
1840 * We have a sendfile queue (snfq) where each sendfile
1841 * request (snf_req_t) is queued for processing by a thread. Number
1842 * of threads is dynamically allocated and they exit if they are idling
1843 * beyond a specified amount of time. When each request (snf_req_t) is
1844 * processed by a thread, it produces a number of mblk_t structures to
1845 * be consumed by the sendfile thread. snf_deque and snf_enque are
1846 * used for consuming and producing mblks. Size of the filesystem
1847 * read is determined by the tunable (sendfile_read_size). A single
1848 * mblk holds sendfile_read_size worth of data (except the last
1849 * read of the file) which is sent down as a whole to the network.
1850 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1851 * value for the UFS filesystem backed by a striped storage array.
1853 * Synchronisation between read (producer) and write (consumer) threads.
1854 * --------------------------------------------------------------------
1856 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1857 * adding and deleting items in this list. Error can happen anytime
1858 * during read or write. There could be unprocessed mblks in the
1859 * sr_ib_XXX list when a read or write error occurs. Whenever error
1860 * is encountered, we need two things to happen :
1862 * a) One of the threads need to clean the mblks.
1863 * b) When one thread encounters an error, the other should stop.
1865 * For (a), we don't want to penalize the reader thread as it could do
1866 * some useful work processing other requests. For (b), the error can
1867 * be detected by examining sr_read_error or sr_write_error.
1868 * sr_lock protects sr_read_error and sr_write_error. If both reader and
1869 * writer encounters error, we need to report the write error back to
1870 * the application as that's what would have happened if the operations
1871 * were done sequentially. With this in mind, following should work :
1873 * - Check for errors before read or write.
1874 * - If the reader encounters error, set the error in sr_read_error.
1875 * Check sr_write_error, if it is set, send cv_signal as it is
1876 * waiting for reader to complete. If it is not set, the writer
1877 * is either running sinking data to the network or blocked
1878 * because of flow control. For handling the latter case, we
1879 * always send a signal. In any case, it will examine sr_read_error
1880 * and return. sr_read_error is marked with SR_READ_DONE to tell
1881 * the writer that the reader is done in all the cases.
1882 * - If the writer encounters error, set the error in sr_write_error.
1883 * The reader thread is either blocked because of flow control or
1884 * running reading data from the disk. For the former, we need to
1885 * wakeup the thread. Again to keep it simple, we always wake up
1886 * the reader thread. Then, wait for the read thread to complete
1887 * if it is not done yet. Cleanup and return.
1889 * High and low water marks for the read thread.
1890 * --------------------------------------------
1892 * If sendfile() is used to send data over a slow network, we need to
1893 * make sure that the read thread does not produce data at a faster
1894 * rate than the network. This can happen if the disk is faster than
1895 * the network. In such a case, we don't want to build a very large queue.
1896 * But we would still like to get all of the network throughput possible.
1897 * This implies that network should never block waiting for data.
1898 * As there are lot of disk throughput/network throughput combinations
1899 * possible, it is difficult to come up with an accurate number.
1900 * A typical 10K RPM disk has a max seek latency 17ms and rotational
1901 * latency of 3ms for reading a disk block. Thus, the total latency to
1902 * initiate a new read, transfer data from the disk and queue for
 1903  * transmission would take about a max of 25ms. Today's max transfer rate
1904 * for network is 100MB/sec. If the thread is blocked because of flow
1905 * control, it would take 25ms to get new data ready for transmission.
1906 * We have to make sure that network is not idling, while we are initiating
1907 * new transfers. So, at 100MB/sec, to keep network busy we would need
1908 * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1909 * We need to pick a high water mark so that the woken up thread would
1910 * do considerable work before blocking again to prevent thrashing. Currently,
1911 * we pick this to be 10 times that of the low water mark.
1913 * Sendfile with segmap caching (One copy from page cache to mblks).
1914 * ----------------------------------------------------------------
1916 * We use the segmap cache for caching the file, if the size of file
1917 * is <= sendfile_max_size. In this case we don't use threads as VM
1918 * is reasonably fast enough to keep up with the network. If the underlying
1919 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1920 * of data into segmap space, and use the virtual address from segmap
1921 * directly through desballoc() to avoid copy. Once the transport is done
1922 * with the data, the mapping will be released through segmap_release()
1923 * called by the call-back routine.
1925 * If zero-copy is not allowed by the transport, we simply call fop_read()
1926 * to copy the data from the filesystem into our temporary network buffer.
1928 * To disable caching, set sendfile_max_size to 0.
1931 uint_t sendfile_read_size
= 1024 * 1024;
1932 #define SENDFILE_REQ_LOWAT 3 * 1024 * 1024
1933 uint_t sendfile_req_lowat
= SENDFILE_REQ_LOWAT
;
1934 uint_t sendfile_req_hiwat
= 10 * SENDFILE_REQ_LOWAT
;
1935 struct sendfile_stats sf_stats
;
1936 struct sendfile_queue
*snfq
;
1937 clock_t snfq_timeout
;
1938 off64_t sendfile_max_size
;
1940 static void snf_enque(snf_req_t
*, mblk_t
*);
1941 static mblk_t
*snf_deque(snf_req_t
*);
1946 snfq
= kmem_zalloc(sizeof (struct sendfile_queue
), KM_SLEEP
);
1948 mutex_init(&snfq
->snfq_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1949 cv_init(&snfq
->snfq_cv
, NULL
, CV_DEFAULT
, NULL
);
1950 snfq
->snfq_max_threads
= max_ncpus
;
1951 snfq_timeout
= SNFQ_TIMEOUT
;
1952 /* Cache all files by default. */
1953 sendfile_max_size
= MAXOFFSET_T
;
1957 * Queues a mblk_t for network processing.
1960 snf_enque(snf_req_t
*sr
, mblk_t
*mp
)
1963 mutex_enter(&sr
->sr_lock
);
1964 if (sr
->sr_mp_head
== NULL
) {
1965 sr
->sr_mp_head
= sr
->sr_mp_tail
= mp
;
1966 cv_signal(&sr
->sr_cv
);
1968 sr
->sr_mp_tail
->b_next
= mp
;
1969 sr
->sr_mp_tail
= mp
;
1971 sr
->sr_qlen
+= MBLKL(mp
);
1972 while ((sr
->sr_qlen
> sr
->sr_hiwat
) &&
1973 (sr
->sr_write_error
== 0)) {
1974 sf_stats
.ss_full_waits
++;
1975 cv_wait(&sr
->sr_cv
, &sr
->sr_lock
);
1977 mutex_exit(&sr
->sr_lock
);
1981 * De-queues a mblk_t for network processing.
1984 snf_deque(snf_req_t
*sr
)
1988 mutex_enter(&sr
->sr_lock
);
1990 * If we have encountered an error on read or read is
1991 * completed and no more mblks, return NULL.
1992 * We need to check for NULL sr_mp_head also as
1993 * the reads could have completed and there is
1994 * nothing more to come.
1996 if (((sr
->sr_read_error
& ~SR_READ_DONE
) != 0) ||
1997 ((sr
->sr_read_error
& SR_READ_DONE
) &&
1998 sr
->sr_mp_head
== NULL
)) {
1999 mutex_exit(&sr
->sr_lock
);
2003 * To start with neither SR_READ_DONE is marked nor
2004 * the error is set. When we wake up from cv_wait,
2005 * following are the possibilities :
2007 * a) sr_read_error is zero and mblks are queued.
2008 * b) sr_read_error is set to SR_READ_DONE
2009 * and mblks are queued.
2010 * c) sr_read_error is set to SR_READ_DONE
2012 * d) sr_read_error is set to some error other
2013 * than SR_READ_DONE.
2016 while ((sr
->sr_read_error
== 0) && (sr
->sr_mp_head
== NULL
)) {
2017 sf_stats
.ss_empty_waits
++;
2018 cv_wait(&sr
->sr_cv
, &sr
->sr_lock
);
2020 /* Handle (a) and (b) first - the normal case. */
2021 if (((sr
->sr_read_error
& ~SR_READ_DONE
) == 0) &&
2022 (sr
->sr_mp_head
!= NULL
)) {
2023 mp
= sr
->sr_mp_head
;
2024 sr
->sr_mp_head
= mp
->b_next
;
2025 sr
->sr_qlen
-= MBLKL(mp
);
2026 if (sr
->sr_qlen
< sr
->sr_lowat
)
2027 cv_signal(&sr
->sr_cv
);
2028 mutex_exit(&sr
->sr_lock
);
2032 /* Handle (c) and (d). */
2033 mutex_exit(&sr
->sr_lock
);
2038 * Reads data from the filesystem and queues it for network processing.
2041 snf_async_read(snf_req_t
*sr
)
2057 size
= sr
->sr_file_size
;
2058 fileoff
= sr
->sr_file_off
;
2061 * Ignore the error for filesystems that doesn't support DIRECTIO.
2063 (void) fop_ioctl(fp
->f_vnode
, _FIODIRECTIO
, DIRECTIO_ON
, 0,
2067 if (vp
->v_type
== VSOCK
) {
2071 * Get the extra space to insert a header and a trailer.
2076 wroff
= so
->so_proto_props
.sopp_wroff
;
2077 maxblk
= so
->so_proto_props
.sopp_maxblk
;
2078 extra
= wroff
+ so
->so_proto_props
.sopp_tail
;
2080 wroff
= (int)(stp
->sd_wroff
);
2081 maxblk
= (int)(stp
->sd_maxblk
);
2082 extra
= wroff
+ (int)(stp
->sd_tail
);
2086 while ((size
!= 0) && (sr
->sr_write_error
== 0)) {
2088 iosize
= (int)MIN(sr
->sr_maxpsz
, size
);
2091 * Socket filters can limit the mblk size,
2092 * so limit reads to maxblk if there are
2095 if (vp
->v_type
== VSOCK
&&
2096 so
->so_filter_active
> 0 && maxblk
!= INFPSZ
)
2097 iosize
= (int)MIN(iosize
, maxblk
);
2099 mp
= allocb(iosize
+ extra
, BPRI_MED
);
2105 mp
->b_rptr
+= wroff
;
2107 ret_size
= soreadfile(fp
, mp
->b_rptr
, fileoff
, &error
, iosize
);
2109 /* Error or Reached EOF ? */
2110 if ((error
!= 0) || (ret_size
== 0)) {
2114 mp
->b_wptr
= mp
->b_rptr
+ ret_size
;
2118 fileoff
+= ret_size
;
2120 (void) fop_ioctl(fp
->f_vnode
, _FIODIRECTIO
, DIRECTIO_OFF
, 0,
2122 mutex_enter(&sr
->sr_lock
);
2123 sr
->sr_read_error
= error
;
2124 sr
->sr_read_error
|= SR_READ_DONE
;
2125 cv_signal(&sr
->sr_cv
);
2126 mutex_exit(&sr
->sr_lock
);
2130 snf_async_thread(void)
2133 callb_cpr_t cprinfo
;
2134 clock_t time_left
= 1;
2136 CALLB_CPR_INIT(&cprinfo
, &snfq
->snfq_lock
, callb_generic_cpr
, "snfq");
2138 mutex_enter(&snfq
->snfq_lock
);
2141 * If we didn't find a entry, then block until woken up
2142 * again and then look through the queues again.
2144 while ((sr
= snfq
->snfq_req_head
) == NULL
) {
2145 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
2146 if (time_left
<= 0) {
2147 snfq
->snfq_svc_threads
--;
2148 CALLB_CPR_EXIT(&cprinfo
);
2152 snfq
->snfq_idle_cnt
++;
2154 time_left
= cv_reltimedwait(&snfq
->snfq_cv
,
2155 &snfq
->snfq_lock
, snfq_timeout
, TR_CLOCK_TICK
);
2156 snfq
->snfq_idle_cnt
--;
2158 CALLB_CPR_SAFE_END(&cprinfo
, &snfq
->snfq_lock
);
2160 snfq
->snfq_req_head
= sr
->sr_next
;
2161 snfq
->snfq_req_cnt
--;
2162 mutex_exit(&snfq
->snfq_lock
);
2164 mutex_enter(&snfq
->snfq_lock
);
2170 create_thread(int operation
, struct vnode
*vp
, file_t
*fp
,
2171 uoff_t fileoff
, uoff_t size
)
2176 sr
= (snf_req_t
*)kmem_zalloc(sizeof (snf_req_t
), KM_SLEEP
);
2183 * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
2184 * stream might be closed before thread returns from snf_async_read.
2186 if (stp
!= NULL
&& stp
->sd_qn_maxpsz
> 0) {
2187 sr
->sr_maxpsz
= MIN(MAXBSIZE
, stp
->sd_qn_maxpsz
);
2189 sr
->sr_maxpsz
= MAXBSIZE
;
2192 sr
->sr_operation
= operation
;
2193 sr
->sr_file_off
= fileoff
;
2194 sr
->sr_file_size
= size
;
2195 sr
->sr_hiwat
= sendfile_req_hiwat
;
2196 sr
->sr_lowat
= sendfile_req_lowat
;
2197 mutex_init(&sr
->sr_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
2198 cv_init(&sr
->sr_cv
, NULL
, CV_DEFAULT
, NULL
);
2200 * See whether we need another thread for servicing this
2201 * request. If there are already enough requests queued
2202 * for the threads, create one if not exceeding
2205 mutex_enter(&snfq
->snfq_lock
);
2206 if (snfq
->snfq_req_cnt
>= snfq
->snfq_idle_cnt
&&
2207 snfq
->snfq_svc_threads
< snfq
->snfq_max_threads
) {
2208 (void) thread_create(NULL
, 0, &snf_async_thread
, 0, 0, &p0
,
2209 TS_RUN
, minclsyspri
);
2210 snfq
->snfq_svc_threads
++;
2212 if (snfq
->snfq_req_head
== NULL
) {
2213 snfq
->snfq_req_head
= snfq
->snfq_req_tail
= sr
;
2214 cv_signal(&snfq
->snfq_cv
);
2216 snfq
->snfq_req_tail
->sr_next
= sr
;
2217 snfq
->snfq_req_tail
= sr
;
2219 snfq
->snfq_req_cnt
++;
2220 mutex_exit(&snfq
->snfq_lock
);
2225 snf_direct_io(file_t
*fp
, file_t
*rfp
, uoff_t fileoff
, uoff_t size
,
2239 bzero(&msg
, sizeof (msg
));
2243 if ((sr
= create_thread(READ_OP
, vp
, rfp
, fileoff
, size
)) == NULL
)
2247 * We check for read error in snf_deque. It has to check
2248 * for successful READ_DONE and return NULL, and we might
2249 * as well make an additional check there.
2251 while ((mp
= snf_deque(sr
)) != NULL
) {
2253 if (ISSIG(curthread
, JUSTLOOKING
)) {
2260 error
= socket_sendmblk(VTOSO(vp
), &msg
, fflag
, CRED(), &mp
);
2271 mutex_enter(&sr
->sr_lock
);
2272 sr
->sr_write_error
= error
;
2273 /* Look at the big comments on why we cv_signal here. */
2274 cv_signal(&sr
->sr_cv
);
2276 /* Wait for the reader to complete always. */
2277 while (!(sr
->sr_read_error
& SR_READ_DONE
)) {
2278 cv_wait(&sr
->sr_cv
, &sr
->sr_lock
);
2280 /* If there is no write error, check for read error. */
2282 error
= (sr
->sr_read_error
& ~SR_READ_DONE
);
2287 mp
= sr
->sr_mp_head
;
2288 while (mp
!= NULL
) {
2289 next_mp
= mp
->b_next
;
2295 mutex_exit(&sr
->sr_lock
);
2296 kmem_free(sr
, sizeof (snf_req_t
));
2300 /* Maximum no.of pages allocated by vpm for sendfile at a time */
2301 #define SNF_VPMMAXPGS (VPMMAXPGS/2)
2304 * Maximum no.of elements in the list returned by vpm, including
2305 * NULL for the last entry
2307 #define SNF_MAXVMAPS (SNF_VPMMAXPGS + 1)
2310 unsigned int snfv_ref
;
2313 struct vmap snfv_vml
[SNF_MAXVMAPS
];
2314 } snf_vmap_desbinfo
;
2322 } snf_smap_desbinfo
;
2325 * The callback function used for vpm mapped mblks called when the last ref of
2326 * the mblk is dropped which normally occurs when TCP receives the ack. But it
2327 * can be the driver too due to lazy reclaim.
2330 snf_vmap_desbfree(snf_vmap_desbinfo
*snfv
)
2332 ASSERT(snfv
->snfv_ref
!= 0);
2333 if (atomic_dec_32_nv(&snfv
->snfv_ref
) == 0) {
2334 vpm_unmap_pages(snfv
->snfv_vml
, S_READ
);
2335 VN_RELE(snfv
->snfv_vp
);
2336 kmem_free(snfv
, sizeof (snf_vmap_desbinfo
));
2341 * The callback function used for segmap'ped mblks called when the last ref of
2342 * the mblk is dropped which normally occurs when TCP receives the ack. But it
2343 * can be the driver too due to lazy reclaim.
2346 snf_smap_desbfree(snf_smap_desbinfo
*snfi
)
2348 if (! IS_KPM_ADDR(snfi
->snfi_base
)) {
2350 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2351 * segmap_kpm as long as the latter never falls back to
2352 * "use_segmap_range". (See segmap_getmapflt().)
2354 * Using S_OTHER saves an redundant hat_setref() in
2357 (void) segmap_fault(kas
.a_hat
, segkmap
,
2358 (caddr_t
)(uintptr_t)(((uintptr_t)snfi
->snfi_base
+
2359 snfi
->snfi_mapoff
) & PAGEMASK
), snfi
->snfi_len
,
2360 F_SOFTUNLOCK
, S_OTHER
);
2362 (void) segmap_release(segkmap
, snfi
->snfi_base
, SM_DONTNEED
);
2363 VN_RELE(snfi
->snfi_vp
);
2364 kmem_free(snfi
, sizeof (*snfi
));
2368 * Use segmap or vpm instead of bcopy to send down a desballoca'ed, mblk.
2369 * When segmap is used, the mblk contains a segmap slot of no more
2372 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2373 * in each iteration and sent by socket_sendmblk until an error occurs or
2374 * the requested size has been transferred. An mblk is esballoca'ed from
2375 * each mapped page and a chain of these mblk is sent to the transport layer.
2376 * vpm will be called to unmap the pages when all mblks have been freed by
2379 * At the end of the whole sendfile() operation, we wait till the data from
2380 * the last mblk is ack'ed by the transport before returning so that the
2381 * caller of sendfile() can safely modify the file content.
2383 * The caller of this function should make sure that total_size does not exceed
2384 * the actual file size of fvp.
2387 snf_segmap(file_t
*fp
, vnode_t
*fvp
, uoff_t fileoff
, uoff_t total_size
,
2388 ssize_t
*count
, boolean_t nowait
)
2396 clock_t deadlk_wait
;
2400 boolean_t dowait
= B_FALSE
;
2406 bzero(&msg
, sizeof (msg
));
2409 if (ISSIG(curthread
, JUSTLOOKING
)) {
2415 snf_vmap_desbinfo
*snfv
;
2421 mapoff
= fileoff
& PAGEOFFSET
;
2422 maxsize
= MIN((SNF_VPMMAXPGS
* PAGESIZE
), total_size
);
2424 snfv
= kmem_zalloc(sizeof (snf_vmap_desbinfo
),
2428 * Get vpm mappings for maxsize with read access.
2429 * If the pages aren't available yet, we get
2430 * DEADLK, so wait and try again a little later using
2431 * an increasing wait. We might be here a long time.
2433 * If delay_sig returns EINTR, be sure to exit and
2434 * pass it up to the caller.
2437 while ((error
= vpm_map_pages(fvp
, fileoff
,
2438 (size_t)maxsize
, (VPM_FETCHPAGE
), snfv
->snfv_vml
,
2439 SNF_MAXVMAPS
, NULL
, S_READ
)) == EDEADLK
) {
2440 deadlk_wait
+= (deadlk_wait
< 5) ? 1 : 4;
2441 if ((error
= delay_sig(deadlk_wait
)) != 0) {
2446 kmem_free(snfv
, sizeof (snf_vmap_desbinfo
));
2447 error
= (error
== EINTR
) ? EINTR
: EIO
;
2450 snfv
->snfv_frtn
.free_func
= snf_vmap_desbfree
;
2451 snfv
->snfv_frtn
.free_arg
= (caddr_t
)snfv
;
2453 /* Construct the mblk chain from the page mappings */
2455 for (i
= 0; (snfv
->snfv_vml
[i
].vs_addr
!= NULL
) &&
2456 total_size
> 0; i
++) {
2457 ASSERT(chain_size
< maxsize
);
2458 mblk_size
= MIN(snfv
->snfv_vml
[i
].vs_len
-
2459 mapoff
, total_size
);
2461 (uchar_t
*)snfv
->snfv_vml
[i
].vs_addr
+
2462 mapoff
, mblk_size
, BPRI_HI
,
2466 * We return EAGAIN after unmapping the pages
2467 * if we cannot allocate the the head of the
2468 * chain. Otherwise, we continue sending the
2469 * mblks constructed so far.
2473 vpm_unmap_pages(snfv
->snfv_vml
,
2476 sizeof (snf_vmap_desbinfo
));
2482 /* Mark this dblk with the zero-copy flag */
2483 nmp
->b_datap
->db_struioflag
|= STRUIO_ZC
;
2484 nmp
->b_wptr
+= mblk_size
;
2485 chain_size
+= mblk_size
;
2486 fileoff
+= mblk_size
;
2487 total_size
-= mblk_size
;
2496 snfv
->snfv_vp
= fvp
;
2498 /* vpm not supported. fallback to segmap */
2499 snf_smap_desbinfo
*snfi
;
2501 mapoff
= fileoff
& MAXBOFFSET
;
2502 chain_size
= MAXBSIZE
- mapoff
;
2503 if (chain_size
> total_size
)
2504 chain_size
= total_size
;
2506 * we don't forcefault because we'll call
2507 * segmap_fault(F_SOFTLOCK) next.
2509 * S_READ will get the ref bit set (by either
2510 * segmap_getmapflt() or segmap_fault()) and page
2513 base
= segmap_getmapflt(segkmap
, fvp
, fileoff
,
2514 chain_size
, segmap_kpm
? SM_FAULT
: 0, S_READ
);
2516 snfi
= kmem_alloc(sizeof (*snfi
), KM_SLEEP
);
2517 snfi
->snfi_len
= (size_t)roundup(mapoff
+chain_size
,
2518 PAGESIZE
)- (mapoff
& PAGEMASK
);
2520 * We must call segmap_fault() even for segmap_kpm
2521 * because that's how error gets returned.
2522 * (segmap_getmapflt() never fails but segmap_fault()
2525 * If the pages aren't available yet, we get
2526 * DEADLK, so wait and try again a little later using
2527 * an increasing wait. We might be here a long time.
2529 * If delay_sig returns EINTR, be sure to exit and
2530 * pass it up to the caller.
2533 while ((error
= FC_ERRNO(segmap_fault(kas
.a_hat
,
2534 segkmap
, (caddr_t
)(uintptr_t)(((uintptr_t)base
+
2535 mapoff
) & PAGEMASK
), snfi
->snfi_len
, F_SOFTLOCK
,
2536 S_READ
))) == EDEADLK
) {
2537 deadlk_wait
+= (deadlk_wait
< 5) ? 1 : 4;
2538 if ((error
= delay_sig(deadlk_wait
)) != 0) {
2543 (void) segmap_release(segkmap
, base
, 0);
2544 kmem_free(snfi
, sizeof (*snfi
));
2545 error
= (error
== EINTR
) ? EINTR
: EIO
;
2548 snfi
->snfi_frtn
.free_func
= snf_smap_desbfree
;
2549 snfi
->snfi_frtn
.free_arg
= (caddr_t
)snfi
;
2550 snfi
->snfi_base
= base
;
2551 snfi
->snfi_mapoff
= mapoff
;
2552 mp
= esballoca((uchar_t
*)base
+ mapoff
, chain_size
,
2553 BPRI_HI
, &snfi
->snfi_frtn
);
2556 (void) segmap_fault(kas
.a_hat
, segkmap
,
2557 (caddr_t
)(uintptr_t)(((uintptr_t)base
+
2558 mapoff
) & PAGEMASK
), snfi
->snfi_len
,
2559 F_SOFTUNLOCK
, S_OTHER
);
2560 (void) segmap_release(segkmap
, base
, 0);
2561 kmem_free(snfi
, sizeof (*snfi
));
2567 snfi
->snfi_vp
= fvp
;
2568 mp
->b_wptr
+= chain_size
;
2570 /* Mark this dblk with the zero-copy flag */
2571 mp
->b_datap
->db_struioflag
|= STRUIO_ZC
;
2572 fileoff
+= chain_size
;
2573 total_size
-= chain_size
;
2576 if (total_size
== 0 && !nowait
) {
2579 mp
->b_datap
->db_struioflag
|= STRUIO_ZCNOTIFY
;
2581 fop_rwunlock(fvp
, V_WRITELOCK_FALSE
, NULL
);
2582 error
= socket_sendmblk(VTOSO(vp
), &msg
, fflag
, CRED(), &mp
);
2585 * mp contains the mblks that were not sent by
2586 * socket_sendmblk. Use its size to update *count
2588 *count
= ksize
+ (chain_size
- msgdsize(mp
));
2593 ksize
+= chain_size
;
2594 if (total_size
== 0)
2597 (void) fop_rwlock(fvp
, V_WRITELOCK_FALSE
, NULL
);
2598 va
.va_mask
= AT_SIZE
;
2599 error
= fop_getattr(fvp
, &va
, 0, kcred
, NULL
);
2602 /* Read as much as possible. */
2603 if (fileoff
>= va
.va_size
)
2605 if (total_size
+ fileoff
> va
.va_size
)
2606 total_size
= va
.va_size
- fileoff
;
2609 fop_rwunlock(fvp
, V_WRITELOCK_FALSE
, NULL
);
2619 error
= so_zcopy_wait(so
);
2621 mutex_enter(&stp
->sd_lock
);
2622 while (!(stp
->sd_flag
& STZCNOTIFY
)) {
2623 if (cv_wait_sig(&stp
->sd_zcopy_wait
,
2624 &stp
->sd_lock
) == 0) {
2629 stp
->sd_flag
&= ~STZCNOTIFY
;
2630 mutex_exit(&stp
->sd_lock
);
2637 snf_cache(file_t
*fp
, vnode_t
*fvp
, uoff_t fileoff
, uoff_t size
,
2638 uint_t maxpsz
, ssize_t
*count
)
2657 if (vp
->v_type
== VSOCK
) {
2661 * Get the extra space to insert a header and a trailer.
2666 wroff
= so
->so_proto_props
.sopp_wroff
;
2667 maxblk
= so
->so_proto_props
.sopp_maxblk
;
2668 extra
= wroff
+ so
->so_proto_props
.sopp_tail
;
2670 wroff
= (int)(stp
->sd_wroff
);
2671 maxblk
= (int)(stp
->sd_maxblk
);
2672 extra
= wroff
+ (int)(stp
->sd_tail
);
2675 bzero(&msg
, sizeof (msg
));
2678 auio
.uio_iov
= &aiov
;
2679 auio
.uio_iovcnt
= 1;
2680 auio
.uio_segflg
= UIO_SYSSPACE
;
2681 auio
.uio_llimit
= MAXOFFSET_T
;
2682 auio
.uio_fmode
= fflag
;
2683 auio
.uio_extflg
= UIO_COPY_CACHED
;
2684 ioflag
= auio
.uio_fmode
& (FSYNC
|FDSYNC
|FRSYNC
);
2685 /* If read sync is not asked for, filter sync flags */
2686 if ((ioflag
& FRSYNC
) == 0)
2687 ioflag
&= ~(FSYNC
|FDSYNC
);
2689 if (ISSIG(curthread
, JUSTLOOKING
)) {
2693 iosize
= (int)MIN(maxpsz
, size
);
2696 * Socket filters can limit the mblk size,
2697 * so limit reads to maxblk if there are
2700 if (vp
->v_type
== VSOCK
&&
2701 so
->so_filter_active
> 0 && maxblk
!= INFPSZ
)
2702 iosize
= (int)MIN(iosize
, maxblk
);
2704 mp
= allocb(iosize
+ extra
, BPRI_MED
);
2710 mp
->b_rptr
+= wroff
;
2712 aiov
.iov_base
= (caddr_t
)mp
->b_rptr
;
2713 aiov
.iov_len
= iosize
;
2714 auio
.uio_loffset
= fileoff
;
2715 auio
.uio_resid
= iosize
;
2717 error
= fop_read(fvp
, &auio
, ioflag
, fp
->f_cred
, NULL
);
2718 iosize
-= auio
.uio_resid
;
2720 if (error
== EINTR
&& iosize
!= 0)
2723 if (error
!= 0 || iosize
== 0) {
2727 mp
->b_wptr
= mp
->b_rptr
+ iosize
;
2729 fop_rwunlock(fvp
, V_WRITELOCK_FALSE
, NULL
);
2731 error
= socket_sendmblk(VTOSO(vp
), &msg
, fflag
, CRED(), &mp
);
2745 (void) fop_rwlock(fvp
, V_WRITELOCK_FALSE
, NULL
);
2746 va
.va_mask
= AT_SIZE
;
2747 error
= fop_getattr(fvp
, &va
, 0, kcred
, NULL
);
2750 /* Read as much as possible. */
2751 if (fileoff
>= va
.va_size
)
2753 else if (size
+ fileoff
> va
.va_size
)
2754 size
= va
.va_size
- fileoff
;
2756 fop_rwunlock(fvp
, V_WRITELOCK_FALSE
, NULL
);
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * Largefile support for 32 bit applications only.
 * Validates the (offset, length) vector, clamps it to the file size, and
 * dispatches to snf_direct_io() (large uncached transfers), snf_segmap()
 * (zero-copy when the transport supports it and the transfer is large
 * enough) or snf_cache() (one-copy through the page cache).
 * NOTE(review): several interior lines and the function tail were lost in
 * extraction and restored from the visible statements — verify against
 * illumos-gate, particularly the final return convention.
 */
int
sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
    ssize32_t *count32)
{
	ssize32_t sfv_len;
	uoff_t sfv_off, va_size;
	struct vnode *vp, *fvp, *realvp;
	struct vattr va;
	stdata_t *stp;
	ssize_t count = 0;
	int error = 0;
	boolean_t dozcopy = B_FALSE;
	uint_t maxpsz;

	sfv_len = (ssize32_t)sfv->sfv_len;
	if (sfv_len < 0) {
		error = EINVAL;
		goto out;
	}

	if (sfv_len == 0) goto out;

	sfv_off = (uoff_t)sfv->sfv_off;

	/* Same checks as in pread */
	if (sfv_off > MAXOFFSET_T) {
		error = EINVAL;
		goto out;
	}
	if (sfv_off + sfv_len > MAXOFFSET_T)
		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

	/*
	 * There are no more checks on sfv_len. So, we cast it to
	 * uoff_t and share the snf_direct_io/snf_cache code between
	 * 32 bit and 64 bit.
	 *
	 * TODO: should do nbl_need_check() like read()?
	 */
	if (sfv_len > sendfile_max_size) {
		sf_stats.ss_file_not_cached++;
		error = snf_direct_io(fp, rfp, sfv_off, (uoff_t)sfv_len,
		    &count);
		goto out;
	}
	fvp = rfp->f_vnode;
	if (fop_realvp(fvp, &realvp, NULL) == 0)
		fvp = realvp;
	/*
	 * Grab the lock as a reader to prevent the file size
	 * from changing underneath.
	 */
	(void) fop_rwlock(fvp, V_WRITELOCK_FALSE, NULL);
	va.va_mask = AT_SIZE;
	error = fop_getattr(fvp, &va, 0, kcred, NULL);
	va_size = va.va_size;
	if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
		fop_rwunlock(fvp, V_WRITELOCK_FALSE, NULL);
		goto out;
	}
	/* Read as much as possible. */
	if (sfv_off + sfv_len > va_size)
		sfv_len = va_size - sfv_off;

	vp = fp->f_vnode;
	stp = vp->v_stream;
	/*
	 * When the NOWAIT flag is not set, we enable zero-copy only if the
	 * transfer size is large enough. This prevents performance loss
	 * when the caller sends the file piece by piece.
	 */
	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
	    !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
		uint_t copyflag;

		copyflag = stp != NULL ? stp->sd_copyflag :
		    VTOSO(vp)->so_proto_props.sopp_zcopyflag;
		if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
			int on = 1;

			if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
			    SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
				dozcopy = B_TRUE;
		} else {
			dozcopy = copyflag & STZCVMSAFE;
		}
	}
	if (dozcopy) {
		sf_stats.ss_file_segmap++;
		error = snf_segmap(fp, fvp, sfv_off, (uoff_t)sfv_len,
		    &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
	} else {
		if (vp->v_type == VSOCK && stp == NULL) {
			sonode_t *so = VTOSO(vp);

			maxpsz = so->so_proto_props.sopp_maxpsz;
		} else if (stp != NULL) {
			maxpsz = stp->sd_qn_maxpsz;
		} else {
			maxpsz = maxphys;
		}

		if (maxpsz == INFPSZ)
			maxpsz = maxphys;
		else
			maxpsz = roundup(maxpsz, MAXBSIZE);
		sf_stats.ss_file_cached++;
		error = snf_cache(fp, fvp, sfv_off, (uoff_t)sfv_len,
		    maxpsz, &count);
	}
out:
	releasef(sfv->sfv_fd);
	*count32 = (ssize32_t)count;
	/* NOTE(review): tail restored; confirm return convention upstream. */
	if (error != 0)
		return (set_errno(error));
	return (0);
}
#endif /* _SYSCALL32_IMPL || _ILP32 */
2882 #ifdef _SYSCALL32_IMPL
2884 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2885 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2889 recv32(int32_t sock
, caddr32_t buffer
, size32_t len
, int32_t flags
)
2891 return (recv(sock
, (void *)(uintptr_t)buffer
, (ssize32_t
)len
, flags
));
2895 recvfrom32(int32_t sock
, caddr32_t buffer
, size32_t len
, int32_t flags
,
2896 caddr32_t name
, caddr32_t namelenp
)
2898 return (recvfrom(sock
, (void *)(uintptr_t)buffer
, (ssize32_t
)len
, flags
,
2899 (void *)(uintptr_t)name
, (void *)(uintptr_t)namelenp
));
2903 send32(int32_t sock
, caddr32_t buffer
, size32_t len
, int32_t flags
)
2905 return (send(sock
, (void *)(uintptr_t)buffer
, (ssize32_t
)len
, flags
));
2909 sendto32(int32_t sock
, caddr32_t buffer
, size32_t len
, int32_t flags
,
2910 caddr32_t name
, socklen_t namelen
)
2912 return (sendto(sock
, (void *)(uintptr_t)buffer
, (ssize32_t
)len
, flags
,
2913 (void *)(uintptr_t)name
, namelen
));
2915 #endif /* _SYSCALL32_IMPL */
2918 * Function wrappers (mostly around the sonode switch) for
2919 * backward compatibility.
/*
 * Backward-compatibility wrapper: accept a connection on `so', storing
 * the new sonode in *nsop, on behalf of the current credentials.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
	return (socket_accept(so, fflag, CRED(), nsop));
}
2929 sobind(struct sonode
*so
, struct sockaddr
*name
, socklen_t namelen
,
2930 int backlog
, int flags
)
2934 error
= socket_bind(so
, name
, namelen
, flags
, CRED());
2935 if (error
== 0 && backlog
!= 0)
2936 return (socket_listen(so
, backlog
, CRED()));
/*
 * Backward-compatibility wrapper around socket_listen().
 */
int
solisten(struct sonode *so, int backlog)
{
	return (socket_listen(so, backlog, CRED()));
}
2948 soconnect(struct sonode
*so
, struct sockaddr
*name
, socklen_t namelen
,
2949 int fflag
, int flags
)
2951 return (socket_connect(so
, name
, namelen
, fflag
, flags
, CRED()));
/*
 * Backward-compatibility wrapper around socket_recvmsg().
 */
int
sorecvmsg(struct sonode *so, struct msghdr *msg, struct uio *uiop)
{
	return (socket_recvmsg(so, msg, uiop, CRED()));
}
/*
 * Backward-compatibility wrapper around socket_sendmsg().
 */
int
sosendmsg(struct sonode *so, struct msghdr *msg, struct uio *uiop)
{
	return (socket_sendmsg(so, msg, uiop, CRED()));
}
/*
 * Backward-compatibility wrapper around socket_shutdown().
 */
int
soshutdown(struct sonode *so, int how)
{
	return (socket_shutdown(so, how, CRED()));
}
2973 sogetsockopt(struct sonode
*so
, int level
, int option_name
, void *optval
,
2974 socklen_t
*optlenp
, int flags
)
2976 return (socket_getsockopt(so
, level
, option_name
, optval
, optlenp
,
2981 sosetsockopt(struct sonode
*so
, int level
, int option_name
, const void *optval
,
2984 return (socket_setsockopt(so
, level
, option_name
, optval
, optlen
,
2989 * Because this is backward compatibility interface it only needs to be
2990 * able to handle the creation of TPI sockfs sockets.
2993 socreate(struct sockparams
*sp
, int family
, int type
, int protocol
,
3000 so
= sp
->sp_smod_info
->smod_sock_create_func(sp
, family
, type
, protocol
,
3001 SOCKET_SLEEP
, errorp
, CRED());
3003 SOCKPARAMS_DEC_REF(sp
);
3005 if ((*errorp
= SOP_INIT(so
, NULL
, CRED(), SOCKET_SLEEP
)) == 0) {
3006 /* Cannot fail, only bumps so_count */
3007 (void) fop_open(&SOTOV(so
), FREAD
|FWRITE
, CRED(), NULL
);