/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2015, Joyent, Inc. All rights reserved.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>
#include <sys/limits.h>

#include <sys/socket.h>
#include <sys/socketvar.h>

#include <sys/isa_defs.h>
#include <sys/inttypes.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/filio.h>
#include <sys/sendfile.h>

#include <vm/seg_map.h>
#include <vm/seg_kpm.h>

#include "sockcommon.h"
#include "sockfilter_impl.h"
#ifdef SOCK_TEST
int do_useracc = 1;	/* Controlled by setting SO_DEBUG to 4 */
#else
#define	do_useracc	1
#endif /* SOCK_TEST */

extern int xnet_truncate_print;
/*
 * Kernel component of socket creation.
 *
 * First the library calls this with a NULL devpath. If this fails
 * to find a transport (using solookup) the library will look in /etc/netconfig
 * for the appropriate transport. If one is found it will pass in the
 * devpath for the kernel to use.
 */
int
so_socket(int family, int type_w_flags, int protocol, char *devpath)
{
	struct sonode *so;
	vnode_t *vp;
	struct file *fp;
	int fd;
	int type;
	int error = 0;

	type = type_w_flags & SOCK_TYPE_MASK;
	type_w_flags &= ~SOCK_TYPE_MASK;
	if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
		return (set_errno(EINVAL));

	if (devpath != NULL) {
		char *buf;
		size_t kdevpathlen = 0;

		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		if ((error = copyinstr(devpath, buf,
		    MAXPATHLEN, &kdevpathlen)) != 0) {
			kmem_free(buf, MAXPATHLEN);
			return (set_errno(error));
		}
		so = socket_create(family, type, protocol, buf, NULL,
		    SOCKET_SLEEP, CRED(), &error);
		kmem_free(buf, MAXPATHLEN);
	} else {
		so = socket_create(family, type, protocol, NULL, NULL,
		    SOCKET_SLEEP, CRED(), &error);
	}
	if (so == NULL)
		return (set_errno(error));

	/* Allocate a file descriptor for the socket */
	vp = SOTOV(so);
	if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
		(void) socket_close(so, 0, CRED());
		socket_destroy(so);
		return (set_errno(error));
	}

	/*
	 * Now fill in the entries that falloc reserved
	 */
	if (type_w_flags & SOCK_NDELAY) {
		so->so_state |= SS_NDELAY;
		fp->f_flag |= FNDELAY;
	}
	if (type_w_flags & SOCK_NONBLOCK) {
		so->so_state |= SS_NONBLOCK;
		fp->f_flag |= FNONBLOCK;
	}
	mutex_exit(&fp->f_tlock);
	setf(fd, fp);
	if ((type_w_flags & SOCK_CLOEXEC) != 0) {
		f_setfd(fd, FD_CLOEXEC);
	}

	return (fd);
}
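
/*
 * Illustrative note (assumed libsocket behavior, not defined in this
 * file): the devpath handshake described above means userland first
 * calls so_socket() with devpath == NULL, and only retries with the
 * device path looked up in /etc/netconfig if the kernel could not
 * find a transport on the first attempt.
 */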
/*
 * Map from a file descriptor to a socket node.
 * Returns with the file descriptor held, i.e. the caller has to
 * use releasef when done with the file descriptor.
 */
static struct sonode *
getsonode(int sock, int *errorp, file_t **fpp)
{
	file_t *fp;
	vnode_t *vp;
	struct sonode *so;

	if ((fp = getf(sock)) == NULL) {
		*errorp = EBADF;
		return (NULL);
	}
	vp = fp->f_vnode;
	/* Check if it is a socket */
	if (vp->v_type != VSOCK) {
		releasef(sock);
		*errorp = ENOTSOCK;
		return (NULL);
	}
	/*
	 * Use the stream head to find the real socket vnode.
	 * This is needed when namefs sits above sockfs.
	 */
	if (vp->v_stream) {
		ASSERT(vp->v_stream->sd_vnode);
		vp = vp->v_stream->sd_vnode;

		so = VTOSO(vp);
		if (so->so_is_stream) {
			releasef(sock);
			*errorp = ENOTSOCK;
			eprintsoline(so, *errorp);
			return (NULL);
		}
	} else {
		so = VTOSO(vp);
	}

	if (fpp != NULL)
		*fpp = fp;
	return (so);
}
/*
 * Allocate and copyin a sockaddr.
 * Ensures NULL termination for AF_UNIX addresses by extending them
 * with one NULL byte if need be. Verifies that the length is not
 * excessive to prevent an application from consuming all of kernel
 * memory. Returns NULL when an error occurs.
 */
static struct sockaddr *
copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
    int *errorp)
{
	char	*faddr;
	size_t	namelen = (size_t)*namelenp;

	ASSERT(namelen != 0);
	if (namelen > SO_MAXARGSIZE) {
		*errorp = EINVAL;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	faddr = kmem_alloc(namelen, KM_SLEEP);
	if (copyin(name, faddr, namelen)) {
		kmem_free(faddr, namelen);
		*errorp = EFAULT;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	/*
	 * Add space for NULL termination if needed.
	 * Do a quick check if the last byte is NUL.
	 */
	if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
		/* Check if there is any NULL termination */
		size_t	i;
		int foundnull = 0;

		for (i = sizeof (name->sa_family); i < namelen; i++) {
			if (faddr[i] == '\0') {
				foundnull = 1;
				break;
			}
		}
		if (!foundnull) {
			/* Add extra byte for NUL padding */
			char *nfaddr;

			nfaddr = kmem_alloc(namelen + 1, KM_SLEEP);
			bcopy(faddr, nfaddr, namelen);
			kmem_free(faddr, namelen);

			/* NUL terminate */
			nfaddr[namelen] = '\0';
			namelen++;
			ASSERT((socklen_t)namelen == namelen);
			*namelenp = (socklen_t)namelen;
			faddr = nfaddr;
		}
	}
	return ((struct sockaddr *)faddr);
}
/*
 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 */
static int
copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
    void *kaddr, socklen_t klen)
{
	if (uaddr != NULL) {
		if (ulen > klen)
			ulen = klen;

		if (ulen != 0) {
			if (copyout(kaddr, uaddr, ulen))
				return (EFAULT);
		}
	} else
		ulen = 0;

	if (ulenp != NULL) {
		if (copyout(&ulen, ulenp, sizeof (ulen)))
			return (EFAULT);
	}
	return (0);
}
/*
 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 * If klen is greater than ulen it still uses the non-truncated
 * klen to update ulenp.
 */
static int
copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
    void *kaddr, socklen_t klen)
{
	if (uaddr != NULL) {
		if (ulen >= klen)
			ulen = klen;
		else if (ulen != 0 && xnet_truncate_print) {
			printf("sockfs: truncating copyout of address using "
			    "XNET semantics for pid = %d. Lengths %d, %d\n",
			    curproc->p_pid, klen, ulen);
		}

		if (ulen != 0) {
			if (copyout(kaddr, uaddr, ulen))
				return (EFAULT);
		} else
			klen = 0;
	} else
		klen = 0;

	if (ulenp != NULL) {
		if (copyout(&klen, ulenp, sizeof (klen)))
			return (EFAULT);
	}
	return (0);
}
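
/*
 * Illustrative example of the XNET semantics above: with a 16-byte
 * kernel address (klen) and an 8-byte user buffer (ulen), copyout_name()
 * copies only the first 8 bytes but still writes 16 to *ulenp, so the
 * application can detect that the address was truncated.
 */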
/*
 * The socketpair() code in libsocket creates two sockets (using
 * the /etc/netconfig fallback if needed) before calling this routine
 * to connect the two sockets together.
 *
 * For a SOCK_STREAM socketpair a listener is needed - in that case this
 * routine will create a new file descriptor as part of accepting the
 * connection. The library socketpair() will check if svs[2] has changed
 * in which case it will close the changed fd.
 *
 * Note that this code could use the TPI feature of accepting the connection
 * on the listening endpoint. However, that would require significant changes
 * to soaccept.
 */
int
so_socketpair(int sv[2])
{
	int svs[2];
	struct sonode *so1, *so2;
	int error;
	int orig_flags;
	struct sockaddr_ux *name;
	size_t namelen;
	sotpi_info_t *sti1;
	sotpi_info_t *sti2;

	dprint(1, ("so_socketpair(%p)\n", (void *)sv));

	error = useracc(sv, sizeof (svs), B_WRITE);
	if (error && do_useracc)
		return (set_errno(EFAULT));

	if (copyin(sv, svs, sizeof (svs)))
		return (set_errno(EFAULT));

	if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
		return (set_errno(error));

	if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
		releasef(svs[0]);
		return (set_errno(error));
	}

	if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
		error = EOPNOTSUPP;
		goto done;
	}

	sti1 = SOTOTPI(so1);
	sti2 = SOTOTPI(so2);

	/*
	 * The code below makes assumptions about the "sockfs" implementation.
	 * So make sure that the correct implementation is really used.
	 */
	ASSERT(so1->so_ops == &sotpi_sonodeops);
	ASSERT(so2->so_ops == &sotpi_sonodeops);

	if (so1->so_type == SOCK_DGRAM) {
		/*
		 * Bind both sockets and connect them with each other.
		 * Need to allocate name/namelen for soconnect.
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		name->sou_addr = sti2->sti_ux_laddr;
		error = socket_connect(so1,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		if (error) {
			kmem_free(name, namelen);
			eprintsoline(so1, error);
			goto done;
		}
		name->sou_addr = sti1->sti_ux_laddr;
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		releasef(svs[0]);
		releasef(svs[1]);
	} else {
		/*
		 * Bind both sockets, with so1 being a listener.
		 * Connect so2 to so1 - nonblocking to avoid waiting for
		 * soaccept to complete.
		 * Accept a connection on so1. Pass out the new fd as sv[0].
		 * The library will detect the changed fd and close
		 * the original one.
		 */
		struct sonode *nso;
		struct vnode *nvp;
		file_t *nfp;
		int nfd;

		/*
		 * We could simply call socket_listen() here (which would do the
		 * binding automatically) if the code didn't rely on passing
		 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
		    _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
		    CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}

		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		name->sou_addr = sti1->sti_ux_laddr;
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			if (error != EINPROGRESS) {
				eprintsoline(so2, error); goto done;
			}
		}

		error = socket_accept(so1, 0, CRED(), &nso);
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}

		/* wait for so2 to become SS_CONNECTED, ignoring signals */
		mutex_enter(&so2->so_lock);
		error = sowaitconnected(so2, 0, 1);
		mutex_exit(&so2->so_lock);
		if (error != 0) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(so2, error);
			goto done;
		}

		nvp = SOTOV(nso);
		if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(nso, error);
			goto done;
		}
		/*
		 * copy over FNONBLOCK and FNDELAY flags should they exist
		 */
		if (so1->so_state & SS_NONBLOCK)
			nfp->f_flag |= FNONBLOCK;
		if (so1->so_state & SS_NDELAY)
			nfp->f_flag |= FNDELAY;

		/*
		 * fill in the entries that falloc reserved
		 */
		mutex_exit(&nfp->f_tlock);
		setf(nfd, nfp);

		/*
		 * get the original flags before we release
		 */
		VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);

		releasef(svs[0]);
		releasef(svs[1]);

		/*
		 * If FD_CLOEXEC was set on the filedescriptor we're
		 * swapping out, we should set it on the new one too.
		 */
		if (orig_flags & FD_CLOEXEC) {
			f_setfd(nfd, FD_CLOEXEC);
		}

		/*
		 * The socketpair library routine will close the original
		 * svs[0] when this code passes out a different file
		 * descriptor.
		 */
		svs[0] = nfd;

		if (copyout(svs, sv, sizeof (svs))) {
			(void) closeandsetf(nfd, NULL);
			return (set_errno(EFAULT));
		}
	}
	return (0);

done:
	releasef(svs[0]);
	releasef(svs[1]);
	return (set_errno(error));
}
int
bind(int sock, struct sockaddr *name, socklen_t namelen)
{
	struct sonode *so;
	int error;

	dprint(1, ("bind(%d, %p, %d)\n",
	    sock, (void *)name, namelen));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	/* Allocate and copyin name */
	/*
	 * X/Open test does not expect EFAULT with NULL name and non-zero
	 * namelen.
	 */
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so, name, &namelen, &error);
		if (name == NULL) {
			releasef(sock);
			return (set_errno(error));
		}
	} else {
		name = NULL;
		namelen = 0;
	}

	error = socket_bind(so, name, namelen, 0, CRED());

	releasef(sock);
	if (name != NULL)
		kmem_free(name, (size_t)namelen);

	if (error)
		return (set_errno(error));
	return (0);
}
int
listen(int sock, int backlog)
{
	struct sonode *so;
	int error;

	dprint(1, ("listen(%d, %d)\n",
	    sock, backlog));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	error = socket_listen(so, backlog, CRED());

	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}
int
accept(int sock, struct sockaddr *name, socklen_t *namelenp, int flags)
{
	struct sonode *so;
	file_t *fp;
	int error;
	socklen_t namelen;
	struct sonode *nso;
	struct vnode *nvp;
	struct file *nfp;
	int nfd;
	int ssflags = 0;
	struct sockaddr *addrp;
	socklen_t addrlen;

	dprint(1, ("accept(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
		return (set_errno(EINVAL));
	}

	/* Translate SOCK_ flags to their SS_ variant */
	if (flags & SOCK_NONBLOCK)
		ssflags |= SS_NONBLOCK;
	if (flags & SOCK_NDELAY)
		ssflags |= SS_NDELAY;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	if (name != NULL) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(namelenp, &namelen, sizeof (namelen))) {
			releasef(sock);
			return (set_errno(EFAULT));
		}
		if (namelen != 0) {
			error = useracc(name, (size_t)namelen, B_WRITE);
			if (error && do_useracc) {
				releasef(sock);
				return (set_errno(EFAULT));
			}
		}
	} else
		namelen = 0;

	/*
	 * Allocate the user fd before socket_accept() in order to
	 * catch EMFILE errors before calling socket_accept().
	 */
	if ((nfd = ufalloc(0)) == -1) {
		eprintsoline(so, EMFILE);
		releasef(sock);
		return (set_errno(EMFILE));
	}
	error = socket_accept(so, fp->f_flag, CRED(), &nso);
	if (error) {
		setf(nfd, NULL);
		releasef(sock);
		return (set_errno(error));
	}

	nvp = SOTOV(nso);

	ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
	if (namelen != 0) {
		addrlen = so->so_max_addr_len;
		addrp = kmem_alloc(addrlen, KM_SLEEP);

		if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
		    &addrlen, B_TRUE, CRED())) == 0) {
			error = copyout_name(name, namelen, namelenp,
			    addrp, addrlen);
		} else {
			ASSERT(error == EINVAL || error == ENOTCONN);
			error = ECONNABORTED;
		}
		kmem_free(addrp, so->so_max_addr_len);
	}

	if (error) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		releasef(sock);
		return (set_errno(error));
	}
	if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		eprintsoline(so, error);
		releasef(sock);
		return (set_errno(error));
	}
	/*
	 * fill in the entries that falloc reserved
	 */
	nfp->f_vnode = nvp;
	mutex_exit(&nfp->f_tlock);
	setf(nfd, nfp);

	/*
	 * Act on SOCK_CLOEXEC from flags
	 */
	if (flags & SOCK_CLOEXEC) {
		f_setfd(nfd, FD_CLOEXEC);
	}

	/*
	 * Copy FNDELAY and FNONBLOCK from listener to acceptor
	 * and from ssflags
	 */
	if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
		uint_t oflag = nfp->f_flag;
		int arg = 0;

		if ((ssflags | so->so_state) & SS_NONBLOCK)
			arg |= FNONBLOCK;
		else if ((ssflags | so->so_state) & SS_NDELAY)
			arg |= FNDELAY;

		/*
		 * This code is a simplification of the F_SETFL code in fcntl()
		 * Ignore any errors from fop_setfl.
		 */
		if ((error = fop_setfl(nvp, oflag, arg, nfp->f_cred, NULL))
		    != 0) {
			eprintsoline(so, error);
			error = 0;
		} else {
			mutex_enter(&nfp->f_tlock);
			nfp->f_flag &= ~FCNTLFLAGS;
			nfp->f_flag |= arg;
			mutex_exit(&nfp->f_tlock);
		}
	}
	releasef(sock);
	return (nfd);
}
int
connect(int sock, struct sockaddr *name, socklen_t namelen)
{
	struct sonode *so;
	file_t *fp;
	int error;

	dprint(1, ("connect(%d, %p, %d)\n",
	    sock, (void *)name, namelen));

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	/* Allocate and copyin name */
	if (namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so, name, &namelen, &error);
		if (name == NULL) {
			releasef(sock);
			return (set_errno(error));
		}
	} else
		name = NULL;

	error = socket_connect(so, name, namelen, fp->f_flag, 0, CRED());

	releasef(sock);
	if (name != NULL)
		kmem_free(name, (size_t)namelen);
	if (error)
		return (set_errno(error));
	return (0);
}
int
shutdown(int sock, int how)
{
	struct sonode *so;
	int error;

	dprint(1, ("shutdown(%d, %d)\n",
	    sock, how));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	error = socket_shutdown(so, how, CRED());

	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}
/*
 * Common receive routine.
 */
static ssize_t
recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
    socklen_t *namelenp, socklen_t *controllenp, int *flagsp)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	len = uiop->uio_resid;
	uiop->uio_fmode = fp->f_flag;
	uiop->uio_extflg = UIO_COPY_CACHED;

	name = msg->msg_name;
	namelen = msg->msg_namelen;
	control = msg->msg_control;
	controllen = msg->msg_controllen;

	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
	    MSG_DONTWAIT);

	error = socket_recvmsg(so, msg, uiop, CRED());
	if (error) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGRCV, 1);
	releasef(sock);

	error = copyout_name(name, namelen, namelenp,
	    msg->msg_name, msg->msg_namelen);
	if (error)
		goto err;

	if (flagsp != NULL) {
		/*
		 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
		 * when controllen is zero and there is control data to
		 * copy out.
		 */
		if (controllen != 0 &&
		    (msg->msg_controllen > controllen || control == NULL)) {
			dprint(1, ("recvit: CTRUNC %d %d %p\n",
			    msg->msg_controllen, controllen, control));

			msg->msg_flags |= MSG_CTRUNC;
		}
		if (copyout(&msg->msg_flags, flagsp,
		    sizeof (msg->msg_flags))) {
			error = EFAULT;
			goto err;
		}
	}
	/*
	 * Note: This MUST be done last. There can be no "goto err" after this
	 * point since it could make so_closefds run twice on some part
	 * of the file descriptor array.
	 */
	if (controllen != 0) {
		error = copyout_arg(control, controllen, controllenp,
		    msg->msg_control, msg->msg_controllen);
		if (error)
			goto err;

		if (msg->msg_controllen > controllen || control == NULL) {
			if (control == NULL)
				controllen = 0;
			so_closefds(msg->msg_control, msg->msg_controllen,
			    controllen);
		}
	}
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (len - uiop->uio_resid);

err:
	/*
	 * If we fail and the control part contains file descriptors
	 * we have to close the fd's.
	 */
	if (msg->msg_controllen != 0)
		so_closefds(msg->msg_control, msg->msg_controllen, 0);
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (set_errno(error));
}
ssize_t
recv(int sock, void *buffer, size_t len, int flags)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("recv(%d, %p, %ld, %d)\n",
	    sock, buffer, len, flags));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_namelen = 0;
	lmsg.msg_controllen = 0;
	lmsg.msg_flags = 0;
	return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
}
ssize_t
recvfrom(int sock, void *buffer, size_t len, int flags,
    struct sockaddr *name, socklen_t *namelenp)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
	    sock, buffer, len, flags, (void *)name, (void *)namelenp));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = (char *)name;
	if (namelenp != NULL) {
		if (copyin(namelenp, &lmsg.msg_namelen,
		    sizeof (lmsg.msg_namelen)))
			return (set_errno(EFAULT));
	} else {
		lmsg.msg_namelen = 0;
	}
	lmsg.msg_controllen = 0;
	lmsg.msg_flags = 0;

	return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
}
ssize_t
recvmsg(int sock, struct msghdr *msg, int flags)
{
	STRUCT_DECL(msghdr, u_lmsg);
	STRUCT_HANDLE(msghdr, umsgptr);
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	ssize_t iovsize = 0;
	int iovcnt;
	ssize_t len, rval;
	int i;
	int *flagsp;
	model_t	model;

	dprint(1, ("recvmsg(%d, %p, %d)\n",
	    sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);
	STRUCT_SET_HANDLE(umsgptr, model, msg);

	if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
		return (set_errno(EFAULT));
	flagsp = STRUCT_FADDR(umsgptr, msg_flags);

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt < 0 || iovcnt > IOV_MAX)
		return (set_errno(EMSGSIZE));

	if (iovcnt > IOV_MAX_STACK) {
		iovsize = iovcnt * sizeof (struct iovec);
		aiov = kmem_alloc(iovsize, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		ssize_t iov32size;
		ssize32_t count32;

		iov32size = iovcnt * sizeof (struct iovec32);
		if (iovsize != 0)
			aiov32 = kmem_alloc(iov32size, KM_SLEEP);

		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
			if (iovsize != 0) {
				kmem_free(aiov32, iov32size);
				kmem_free(aiov, iovsize);
			}
			return (set_errno(EFAULT));
		}

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (iovsize != 0) {
					kmem_free(aiov32, iov32size);
					kmem_free(aiov, iovsize);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}

		if (iovsize != 0)
			kmem_free(aiov32, iov32size);
	} else
#endif /* _SYSCALL32_IMPL */
	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
		if (iovsize != 0)
			kmem_free(aiov, iovsize);

		return (set_errno(EFAULT));
	}
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;

		len += iovlen;
		if (iovlen < 0 || len < 0) {
			if (iovsize != 0)
				kmem_free(aiov, iovsize);

			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	if (lmsg.msg_control != NULL &&
	    useracc(lmsg.msg_control, lmsg.msg_controllen,
	    B_WRITE) && do_useracc) {
		if (iovsize != 0)
			kmem_free(aiov, iovsize);

		return (set_errno(EFAULT));
	}

	rval = recvit(sock, &lmsg, &auio, flags,
	    STRUCT_FADDR(umsgptr, msg_namelen),
	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp);

	if (iovsize != 0)
		kmem_free(aiov, iovsize);

	return (rval);
}
/*
 * Common send function.
 */
static ssize_t
sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	uiop->uio_fmode = fp->f_flag;

	if (so->so_family == AF_UNIX)
		uiop->uio_extflg = UIO_COPY_CACHED;
	else
		uiop->uio_extflg = UIO_COPY_DEFAULT;

	/* Allocate and copyin name and control */
	name = msg->msg_name;
	namelen = msg->msg_namelen;
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so,
		    (struct sockaddr *)name,
		    &namelen, &error);
		if (name == NULL)
			goto done3;
		/* copyin_name null terminates addresses for AF_UNIX */
		msg->msg_namelen = namelen;
		msg->msg_name = name;
	} else {
		msg->msg_name = name = NULL;
		msg->msg_namelen = namelen = 0;
	}

	control = msg->msg_control;
	controllen = msg->msg_controllen;
	if ((control != NULL) && (controllen != 0)) {
		/*
		 * Verify that the length is not excessive to prevent
		 * an application from consuming all of kernel memory.
		 */
		if (controllen > SO_MAXARGSIZE) {
			error = EINVAL;
			goto done2;
		}
		control = kmem_alloc(controllen, KM_SLEEP);

		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(msg->msg_control, control, controllen)) {
			error = EFAULT;
			goto done1;
		}
		msg->msg_control = control;
	} else {
		msg->msg_control = control = NULL;
		msg->msg_controllen = controllen = 0;
	}

	len = uiop->uio_resid;
	msg->msg_flags = flags;

	error = socket_sendmsg(so, msg, uiop, CRED());
done1:
	if (control != NULL)
		kmem_free(control, controllen);
done2:
	if (name != NULL)
		kmem_free(name, namelen);
done3:
	if (error != 0) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGSND, 1);
	releasef(sock);
	return (len - uiop->uio_resid);
}
/*
 * Native system call
 */
ssize_t
send(int sock, void *buffer, size_t len, int flags)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("send(%d, %p, %ld, %d)\n",
	    sock, buffer, len, flags));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = NULL;
	lmsg.msg_control = NULL;
	return (sendit(sock, &lmsg, &auio, flags));
}
ssize_t
sendmsg(int sock, struct msghdr *msg, int flags)
{
	struct nmsghdr lmsg;
	STRUCT_DECL(msghdr, u_lmsg);
	struct uio auio;
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	ssize_t iovsize = 0;
	int iovcnt;
	ssize_t len, rval;
	int i;
	model_t	model;

	dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);

	if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
	    STRUCT_SIZE(u_lmsg)))
		return (set_errno(EFAULT));

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt < 0 || iovcnt > IOV_MAX)
		return (set_errno(EMSGSIZE));

	if (iovcnt > IOV_MAX_STACK) {
		iovsize = iovcnt * sizeof (struct iovec);
		aiov = kmem_alloc(iovsize, KM_SLEEP);
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
		ssize_t iov32size;
		ssize32_t count32;

		iov32size = iovcnt * sizeof (struct iovec32);
		if (iovsize != 0)
			aiov32 = kmem_alloc(iov32size, KM_SLEEP);

		if (iovcnt != 0 &&
		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
			if (iovsize != 0) {
				kmem_free(aiov32, iov32size);
				kmem_free(aiov, iovsize);
			}
			return (set_errno(EFAULT));
		}

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0) {
				if (iovsize != 0) {
					kmem_free(aiov32, iov32size);
					kmem_free(aiov, iovsize);
				}
				return (set_errno(EINVAL));
			}
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}

		if (iovsize != 0)
			kmem_free(aiov32, iov32size);
	} else
#endif /* _SYSCALL32_IMPL */
	if (iovcnt != 0 &&
	    copyin(lmsg.msg_iov, aiov,
	    (unsigned)iovcnt * sizeof (struct iovec))) {
		if (iovsize != 0)
			kmem_free(aiov, iovsize);

		return (set_errno(EFAULT));
	}
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;

		len += iovlen;
		if (iovlen < 0 || len < 0) {
			if (iovsize != 0)
				kmem_free(aiov, iovsize);

			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	rval = sendit(sock, &lmsg, &auio, flags);

	if (iovsize != 0)
		kmem_free(aiov, iovsize);

	return (rval);
}
ssize_t
sendto(int sock, void *buffer, size_t len, int flags,
    struct sockaddr *name, socklen_t namelen)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
	    sock, buffer, len, flags, (void *)name, namelen));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = (char *)name;
	lmsg.msg_namelen = namelen;
	lmsg.msg_control = NULL;
	return (sendit(sock, &lmsg, &auio, flags));
}
int
getpeername(int sock, struct sockaddr *name, socklen_t *namelenp)
{
	struct sonode *so;
	int error;
	socklen_t namelen;
	socklen_t sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getpeername(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}
	sock_addrlen = so->so_max_addr_len;
	sock_addrp = kmem_alloc(sock_addrlen, KM_SLEEP);

	if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
	    B_FALSE, CRED())) == 0) {
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}
int
getsockname(int sock, struct sockaddr *name,
    socklen_t *namelenp)
{
	struct sonode *so;
	int error;
	socklen_t namelen, sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getsockname(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}

	sock_addrlen = so->so_max_addr_len;
	sock_addrp = kmem_alloc(sock_addrlen, KM_SLEEP);
	if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
	    CRED())) == 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}
int
getsockopt(int sock, int level, int option_name, void *option_value,
    socklen_t *option_lenp)
{
	struct sonode *so;
	socklen_t optlen, optlen_res;
	void *optval;
	int error;

	dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
	    sock, level, option_name, option_value, (void *)option_lenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(option_lenp, &optlen, sizeof (optlen))) {
		releasef(sock);
		return (set_errno(EFAULT));
	}
	/*
	 * Verify that the length is not excessive to prevent
	 * an application from consuming all of kernel memory.
	 */
	if (optlen > SO_MAXARGSIZE) {
		error = EINVAL;
		releasef(sock);
		return (set_errno(error));
	}
	optval = kmem_alloc(optlen, KM_SLEEP);
	optlen_res = optlen;
	error = socket_getsockopt(so, level, option_name, optval,
	    &optlen_res, 0, CRED());
	releasef(sock);
	if (error) {
		kmem_free(optval, optlen);
		return (set_errno(error));
	}
	error = copyout_arg(option_value, optlen, option_lenp,
	    optval, optlen_res);
	kmem_free(optval, optlen);
	if (error)
		return (set_errno(error));
	return (0);
}
int
setsockopt(int sock, int level, int option_name, void *option_value,
    socklen_t option_len)
{
	struct sonode *so;
	intptr_t buffer[2];
	void *optval = NULL;
	int error;

	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
	    sock, level, option_name, option_value, option_len));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	if (option_value != NULL) {
		if (option_len != 0) {
			/*
			 * Verify that the length is not excessive to prevent
			 * an application from consuming all of kernel memory.
			 */
			if (option_len > SO_MAXARGSIZE) {
				error = EINVAL;
				goto done2;
			}
			optval = option_len <= sizeof (buffer) ?
			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
			if (copyin(option_value, optval, (size_t)option_len)) {
				error = EFAULT;
				goto done1;
			}
		}
	} else
		option_len = 0;

	error = socket_setsockopt(so, level, option_name, optval,
	    (t_uscalar_t)option_len, CRED());
done1:
	if (optval != buffer)
		kmem_free(optval, (size_t)option_len);
done2:
	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}
static int
sockconf_add_sock(int family, int type, int protocol, char *name)
{
	int error = 0;
	char *kdevpath = NULL;
	char *kmodule = NULL;
	char *buf = NULL;
	size_t pathlen = 0;
	struct sockparams *sp;

	/*
	 * Copyin the name.
	 * This also makes it possible to check for too long pathnames.
	 * Compress the space needed for the name before passing it
	 * to soconfig - soconfig will store the string until
	 * the configuration is removed.
	 */
	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
		kmem_free(buf, MAXPATHLEN);
		return (error);
	}
	if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
		/* For device */
		kdevpath = kmem_alloc(pathlen, KM_SLEEP);
		bcopy(buf, kdevpath, pathlen);
		kdevpath[pathlen - 1] = '\0';
	} else {
		/* For socket module */
		kmodule = kmem_alloc(pathlen, KM_SLEEP);
		bcopy(buf, kmodule, pathlen);
		kmodule[pathlen - 1] = '\0';
		pathlen = 0;
	}
	kmem_free(buf, MAXPATHLEN);

	/* sockparams_create frees mod name and devpath upon failure */
	sp = sockparams_create(family, type, protocol, kmodule,
	    kdevpath, pathlen, 0, KM_SLEEP, &error);
	if (sp == NULL)
		return (error);

	error = sockparams_add(sp);
	if (error != 0)
		sockparams_destroy(sp);

	return (error);
}
static int
sockconf_remove_sock(int family, int type, int protocol)
{
	return (sockparams_delete(family, type, protocol));
}
static int
sockconfig_remove_filter(const char *uname)
{
	sof_entry_t *ent;
	int error;
	size_t len;
	char kname[SOF_MAXNAMELEN];

	if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
		return (error);

	ent = sof_entry_remove_by_name(kname);
	if (ent == NULL)
		return (ENXIO);

	mutex_enter(&ent->sofe_lock);
	ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
	if (ent->sofe_refcnt == 0) {
		mutex_exit(&ent->sofe_lock);
		sof_entry_free(ent);
	} else {
		/* let the last socket free the filter */
		ent->sofe_flags |= SOFEF_CONDEMED;
		mutex_exit(&ent->sofe_lock);
	}

	return (0);
}
static int
sockconfig_add_filter(const char *uname, void *ufilpropp)
{
	struct sockconfig_filter_props filprop;
	sof_entry_t *ent;
	int error;
	size_t tuplesz, len;
	char hintbuf[SOF_MAXNAMELEN];

	ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
	mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
	    &len)) != 0) {
		sof_entry_free(ent);
		return (error);
	}

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
			sof_entry_free(ent);
			return (EFAULT);
		}
	}
#ifdef _SYSCALL32_IMPL
	else {
		struct sockconfig_filter_props32 filprop32;

		if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
			sof_entry_free(ent);
			return (EFAULT);
		}
		filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
		filprop.sfp_autoattach = filprop32.sfp_autoattach;
		filprop.sfp_hint = filprop32.sfp_hint;
		filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
		filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
		filprop.sfp_socktuple =
		    (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
	}
#endif /* _SYSCALL32_IMPL */

	if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
	    sizeof (ent->sofe_modname), &len)) != 0) {
		sof_entry_free(ent);
		return (error);
	}

	/*
	 * A filter must specify at least one socket tuple.
	 */
	if (filprop.sfp_socktuple_cnt == 0 ||
	    filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
		sof_entry_free(ent);
		return (EINVAL);
	}
	ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
	ent->sofe_hint = filprop.sfp_hint;

	/*
	 * Verify the hint, and copy in the hint argument, if necessary.
	 */
	switch (ent->sofe_hint) {
	case SOF_HINT_BEFORE:
	case SOF_HINT_AFTER:
		if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
		    sizeof (hintbuf), &len)) != 0) {
			sof_entry_free(ent);
			return (error);
		}
		ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
		bcopy(hintbuf, ent->sofe_hintarg, len);
		/* FALLTHRU */
	case SOF_HINT_TOP:
	case SOF_HINT_BOTTOM:
		/* hints cannot be used with programmatic filters */
		if (ent->sofe_flags & SOFEF_PROG) {
			sof_entry_free(ent);
			return (EINVAL);
		}
		break;
	case SOF_HINT_NONE:
		break;
	default:
		/* bad hint value */
		sof_entry_free(ent);
		return (EINVAL);
	}

	ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
	tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
	ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
		    tuplesz)) {
			sof_entry_free(ent);
			return (EFAULT);
		}
	}
#ifdef _SYSCALL32_IMPL
	else {
		int i;
		caddr_t data = (caddr_t)filprop.sfp_socktuple;
		sof_socktuple_t	*tup = ent->sofe_socktuple;
		sof_socktuple32_t tup32;

		tup = ent->sofe_socktuple;
		for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
			ASSERT(tup < ent->sofe_socktuple + tuplesz);

			if (copyin(data, &tup32, sizeof (tup32)) != 0) {
				sof_entry_free(ent);
				return (EFAULT);
			}
			tup->sofst_family = tup32.sofst_family;
			tup->sofst_type = tup32.sofst_type;
			tup->sofst_protocol = tup32.sofst_protocol;

			data += sizeof (tup32);
		}
	}
#endif /* _SYSCALL32_IMPL */

	/* Sockets can start using the filter as soon as the filter is added */
	if ((error = sof_entry_add(ent)) != 0)
		sof_entry_free(ent);

	return (error);
}
/*
 * Socket configuration system call. It is used to add and remove
 * socket types.
 */
int
sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
{
	int error = 0;

	if (secpolicy_net_config(CRED(), B_FALSE) != 0)
		return (set_errno(EPERM));

	switch (cmd) {
	case SOCKCONFIG_ADD_SOCK:
		error = sockconf_add_sock((int)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
		break;
	case SOCKCONFIG_REMOVE_SOCK:
		error = sockconf_remove_sock((int)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
		break;
	case SOCKCONFIG_ADD_FILTER:
		error = sockconfig_add_filter((const char *)arg1, arg2);
		break;
	case SOCKCONFIG_REMOVE_FILTER:
		error = sockconfig_remove_filter((const char *)arg1);
		break;
	case SOCKCONFIG_GET_SOCKTABLE:
		error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
		break;
	default:
#ifdef DEBUG
		cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
#endif
		error = EINVAL;
		break;
	}

	if (error != 0)
		return (set_errno(error));
	return (0);
}
/*
 * Sendfile is implemented through two schemes, direct I/O or by
 * caching in the filesystem page cache. We cache the input file by
 * default and use direct I/O only if sendfile_max_size is set
 * appropriately as explained below. Note that this logic is consistent
 * with other filesystems where caching is turned on by default
 * unless explicitly turned off by using the DIRECTIO ioctl.
 *
 * We choose a slightly different scheme here. One can turn off
 * caching by setting sendfile_max_size to 0. One can also enable
 * caching of files <= sendfile_max_size by setting sendfile_max_size
 * to an appropriate value. By default sendfile_max_size is set to the
 * maximum value so that all files are cached. In future, we may provide
 * better interfaces for caching the file.
 *
 * Sendfile through Direct I/O (Zero copy)
 * --------------------------------------
 *
 * As disks are normally slower than the network, we can't have a
 * single thread that reads the disk and writes to the network. We
 * need to have parallelism. This is done by having the sendfile
 * thread create another thread that reads from the filesystem
 * and queues it for network processing. In this scheme, the data
 * is never copied anywhere, i.e. it is zero copy unlike the other
 * schemes.
 *
 * We have a sendfile queue (snfq) where each sendfile
 * request (snf_req_t) is queued for processing by a thread. Number
 * of threads is dynamically allocated and they exit if they are idling
 * beyond a specified amount of time. When each request (snf_req_t) is
 * processed by a thread, it produces a number of mblk_t structures to
 * be consumed by the sendfile thread. snf_deque and snf_enque are
 * used for consuming and producing mblks. Size of the filesystem
 * read is determined by the tunable (sendfile_read_size). A single
 * mblk holds sendfile_read_size worth of data (except the last
 * read of the file) which is sent down as a whole to the network.
 * sendfile_read_size is set to 1 MB as this seems to be the optimal
 * value for the UFS filesystem backed by a striped storage array.
 *
 * Synchronisation between read (producer) and write (consumer) threads.
 * --------------------------------------------------------------------
 *
 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
 * adding and deleting items in this list. Errors can happen at any time
 * during read or write. There could be unprocessed mblks in the
 * sr_ib_XXX list when a read or write error occurs. Whenever an error
 * is encountered, we need two things to happen :
 *
 *	a) One of the threads needs to clean the mblks.
 *	b) When one thread encounters an error, the other should stop.
 *
 * For (a), we don't want to penalize the reader thread as it could do
 * some useful work processing other requests. For (b), the error can
 * be detected by examining sr_read_error or sr_write_error.
 * sr_lock protects sr_read_error and sr_write_error. If both the reader
 * and the writer encounter errors, we need to report the write error
 * back to the application as that's what would have happened if the
 * operations were done sequentially. With this in mind, the following
 * should work :
 *
 *	- Check for errors before read or write.
 *	- If the reader encounters an error, set the error in sr_read_error.
 *	  Check sr_write_error, if it is set, send cv_signal as it is
 *	  waiting for reader to complete. If it is not set, the writer
 *	  is either running sinking data to the network or blocked
 *	  because of flow control. For handling the latter case, we
 *	  always send a signal. In any case, it will examine sr_read_error
 *	  and return. sr_read_error is marked with SR_READ_DONE to tell
 *	  the writer that the reader is done in all the cases.
 *	- If the writer encounters an error, set the error in sr_write_error.
 *	  The reader thread is either blocked because of flow control or
 *	  running reading data from the disk. For the former, we need to
 *	  wakeup the thread. Again to keep it simple, we always wake up
 *	  the reader thread. Then, wait for the read thread to complete
 *	  if it is not done yet. Cleanup and return.
 *
 * High and low water marks for the read thread.
 * --------------------------------------------
 *
 * If sendfile() is used to send data over a slow network, we need to
 * make sure that the read thread does not produce data at a faster
 * rate than the network. This can happen if the disk is faster than
 * the network. In such a case, we don't want to build a very large queue.
 * But we would still like to get all of the network throughput possible.
 * This implies that network should never block waiting for data.
 * As there are a lot of disk throughput/network throughput combinations
 * possible, it is difficult to come up with an accurate number.
 * A typical 10K RPM disk has a max seek latency of 17ms and a rotational
 * latency of 3ms for reading a disk block. Thus, the total latency to
 * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
 * for a network is 100MB/sec. If the thread is blocked because of flow
 * control, it would take 25ms to get new data ready for transmission.
 * We have to make sure that the network is not idling, while we are
 * initiating new transfers. So, at 100MB/sec, to keep the network busy
 * we would need 2.5MB of data. Rounding off, we keep the low water mark
 * to be 3MB of data.
 * We need to pick a high water mark so that the woken up thread would
 * do considerable work before blocking again to prevent thrashing. Currently,
 * we pick this to be 10 times that of the low water mark.
 *
 * Sendfile with segmap caching (One copy from page cache to mblks).
 * ----------------------------------------------------------------
 *
 * We use the segmap cache for caching the file, if the size of the file
 * is <= sendfile_max_size. In this case we don't use threads as VM
 * is reasonably fast enough to keep up with the network. If the underlying
 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
 * of data into segmap space, and use the virtual address from segmap
 * directly through desballoc() to avoid copy. Once the transport is done
 * with the data, the mapping will be released through segmap_release()
 * called by the call-back routine.
 *
 * If zero-copy is not allowed by the transport, we simply call fop_read()
 * to copy the data from the filesystem into our temporary network buffer.
 *
 * To disable caching, set sendfile_max_size to 0.
 */
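
/*
 * Worked example for the water marks above: keeping the network busy
 * requires queued data >= network rate * worst-case read latency,
 * i.e. 100MB/sec * 0.025sec = 2.5MB, rounded up to the 3MB
 * SENDFILE_REQ_LOWAT below; the high water mark is ten times that.
 */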
uint_t sendfile_read_size = 1024 * 1024;
#define	SENDFILE_REQ_LOWAT	3 * 1024 * 1024
uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;

struct sendfile_stats sf_stats;
struct sendfile_queue *snfq;
clock_t snfq_timeout;
off64_t sendfile_max_size;

static void snf_enque(snf_req_t *, mblk_t *);
static mblk_t *snf_deque(snf_req_t *);
void
sendfile_init(void)
{
	snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);

	mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
	snfq->snfq_max_threads = max_ncpus;
	snfq_timeout = SNFQ_TIMEOUT;
	/* Cache all files by default. */
	sendfile_max_size = MAXOFFSET_T;
}
/*
 * Queues a mblk_t for network processing.
 */
static void
snf_enque(snf_req_t *sr, mblk_t *mp)
{
	mp->b_next = NULL;
	mutex_enter(&sr->sr_lock);
	if (sr->sr_mp_head == NULL) {
		sr->sr_mp_head = sr->sr_mp_tail = mp;
		cv_signal(&sr->sr_cv);
	} else {
		sr->sr_mp_tail->b_next = mp;
		sr->sr_mp_tail = mp;
	}
	sr->sr_qlen += MBLKL(mp);
	while ((sr->sr_qlen > sr->sr_hiwat) &&
	    (sr->sr_write_error == 0)) {
		sf_stats.ss_full_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	mutex_exit(&sr->sr_lock);
}
/*
 * De-queues a mblk_t for network processing.
 */
static mblk_t *
snf_deque(snf_req_t *sr)
{
	mblk_t *mp;

	mutex_enter(&sr->sr_lock);
	/*
	 * If we have encountered an error on read or read is
	 * completed and no more mblks, return NULL.
	 * We need to check for NULL sr_mp_head also as
	 * the reads could have completed and there is
	 * nothing more to come.
	 */
	if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
	    ((sr->sr_read_error & SR_READ_DONE) &&
	    sr->sr_mp_head == NULL)) {
		mutex_exit(&sr->sr_lock);
		return (NULL);
	}
	/*
	 * To start with neither SR_READ_DONE is marked nor
	 * the error is set. When we wake up from cv_wait,
	 * following are the possibilities :
	 *
	 *	a) sr_read_error is zero and mblks are queued.
	 *	b) sr_read_error is set to SR_READ_DONE
	 *	   and mblks are queued.
	 *	c) sr_read_error is set to SR_READ_DONE
	 *	   and no mblks.
	 *	d) sr_read_error is set to some error other
	 *	   than SR_READ_DONE.
	 */

	while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
		sf_stats.ss_empty_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* Handle (a) and (b) first  - the normal case. */
	if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
	    (sr->sr_mp_head != NULL)) {
		mp = sr->sr_mp_head;
		sr->sr_mp_head = mp->b_next;
		sr->sr_qlen -= MBLKL(mp);
		if (sr->sr_qlen < sr->sr_lowat)
			cv_signal(&sr->sr_cv);
		mutex_exit(&sr->sr_lock);
		mp->b_next = NULL;
		return (mp);
	}
	/* Handle (c) and (d). */
	mutex_exit(&sr->sr_lock);
	return (NULL);
}
/*
 * Reads data from the filesystem and queues it for network processing.
 */
static void
snf_async_read(snf_req_t *sr)
{
	size_t iosize;
	uoff_t fileoff;
	uoff_t size;
	int ret_size;
	int error = 0;
	file_t *fp;
	mblk_t *mp;
	struct vnode *vp;
	int extra = 0;
	int maxblk = 0;
	int wroff = 0;
	struct sonode *so = NULL;

	fp = sr->sr_fp;
	size = sr->sr_file_size;
	fileoff = sr->sr_file_off;

	/*
	 * Ignore the error for filesystems that don't support DIRECTIO.
	 */
	(void) fop_ioctl(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
	    kcred, NULL, NULL);

	vp = sr->sr_vp;
	if (vp->v_type == VSOCK) {
		stdata_t *stp;

		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		so = VTOSO(vp);
		stp = vp->v_stream;
		if (stp == NULL) {
			wroff = so->so_proto_props.sopp_wroff;
			maxblk = so->so_proto_props.sopp_maxblk;
			extra = wroff + so->so_proto_props.sopp_tail;
		} else {
			wroff = (int)(stp->sd_wroff);
			maxblk = (int)(stp->sd_maxblk);
			extra = wroff + (int)(stp->sd_tail);
		}
	}

	while ((size != 0) && (sr->sr_write_error == 0)) {

		iosize = (int)MIN(sr->sr_maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);

		mp = allocb(iosize + extra, BPRI_MED);
		if (mp == NULL)
			break;

		mp->b_rptr += wroff;

		ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);

		/* Error or Reached EOF ? */
		if ((error != 0) || (ret_size == 0)) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + ret_size;

		snf_enque(sr, mp);
		size -= ret_size;
		fileoff += ret_size;
	}
	(void) fop_ioctl(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
	    kcred, NULL, NULL);
	mutex_enter(&sr->sr_lock);
	sr->sr_read_error = error;
	sr->sr_read_error |= SR_READ_DONE;
	cv_signal(&sr->sr_cv);
	mutex_exit(&sr->sr_lock);
}
static void
snf_async_thread(void)
{
	snf_req_t *sr;
	callb_cpr_t cprinfo;
	clock_t time_left = 1;

	CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");

	mutex_enter(&snfq->snfq_lock);
	for (;;) {
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		while ((sr = snfq->snfq_req_head) == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			if (time_left <= 0) {
				snfq->snfq_svc_threads--;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
				/* NOTREACHED */
			}
			snfq->snfq_idle_cnt++;

			time_left = cv_reltimedwait(&snfq->snfq_cv,
			    &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
			snfq->snfq_idle_cnt--;

			CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
		}
		snfq->snfq_req_head = sr->sr_next;
		snfq->snfq_req_cnt--;
		mutex_exit(&snfq->snfq_lock);
		snf_async_read(sr);
		mutex_enter(&snfq->snfq_lock);
	}
}
static snf_req_t *
create_thread(int operation, struct vnode *vp, file_t *fp,
    uoff_t fileoff, uoff_t size)
{
	snf_req_t *sr;
	stdata_t *stp;

	sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);

	sr->sr_vp = vp;
	sr->sr_fp = fp;
	stp = vp->v_stream;

	/*
	 * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
	 * stream might be closed before thread returns from snf_async_read.
	 */
	if (stp != NULL && stp->sd_qn_maxpsz > 0) {
		sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
	} else {
		sr->sr_maxpsz = MAXBSIZE;
	}

	sr->sr_operation = operation;
	sr->sr_file_off = fileoff;
	sr->sr_file_size = size;
	sr->sr_hiwat = sendfile_req_hiwat;
	sr->sr_lowat = sendfile_req_lowat;
	mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * See whether we need another thread for servicing this
	 * request. If there are already enough requests queued
	 * for the threads, create one if not exceeding
	 * snfq_max_threads.
	 */
	mutex_enter(&snfq->snfq_lock);
	if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
	    snfq->snfq_svc_threads < snfq->snfq_max_threads) {
		(void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
		    TS_RUN, minclsyspri);
		snfq->snfq_svc_threads++;
	}
	if (snfq->snfq_req_head == NULL) {
		snfq->snfq_req_head = snfq->snfq_req_tail = sr;
		cv_signal(&snfq->snfq_cv);
	} else {
		snfq->snfq_req_tail->sr_next = sr;
		snfq->snfq_req_tail = sr;
	}
	snfq->snfq_req_cnt++;
	mutex_exit(&snfq->snfq_lock);
	return (sr);
}
static int
snf_direct_io(file_t *fp, file_t *rfp, uoff_t fileoff, uoff_t size,
    ssize_t *count)
{
	vnode_t *vp;
	mblk_t *mp;
	mblk_t *next_mp;
	snf_req_t *sr;
	ssize_t iosize;
	int error = 0;
	short fflag;
	ssize_t ksize = 0;
	struct nmsghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	bzero(&msg, sizeof (msg));

	if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
		return (EAGAIN);

	/*
	 * We check for read error in snf_deque. It has to check
	 * for successful READ_DONE and return NULL, and we might
	 * as well make an additional check there.
	 */
	while ((mp = snf_deque(sr)) != NULL) {

		if (ISSIG(curthread, JUSTLOOKING)) {
			freeb(mp);
			error = EINTR;
			break;
		}
		iosize = MBLKL(mp);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			if (mp != NULL)
				freeb(mp);
			break;
		}
		ksize += iosize;
	}
	*count = ksize;

	mutex_enter(&sr->sr_lock);
	sr->sr_write_error = error;
	/* Look at the big comments on why we cv_signal here. */
	cv_signal(&sr->sr_cv);

	/* Wait for the reader to complete always. */
	while (!(sr->sr_read_error & SR_READ_DONE)) {
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* If there is no write error, check for read error. */
	if (error == 0)
		error = (sr->sr_read_error & ~SR_READ_DONE);

	if (error != 0) {
		mp = sr->sr_mp_head;
		while (mp != NULL) {
			next_mp = mp->b_next;
			mp->b_next = NULL;
			freeb(mp);
			mp = next_mp;
		}
	}
	mutex_exit(&sr->sr_lock);
	kmem_free(sr, sizeof (snf_req_t));
	return (error);
}
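
/*
 * Note on the cleanup above: sr_write_error takes precedence over
 * sr_read_error, matching the design comment earlier in this file -
 * the write error is what the application would have seen had the
 * read and write been done sequentially.
 */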
/* Maximum no.of pages allocated by vpm for sendfile at a time */
#define	SNF_VPMMAXPGS	(VPMMAXPGS/2)

/*
 * Maximum no.of elements in the list returned by vpm, including
 * NULL for the last entry
 */
#define	SNF_MAXVMAPS	(SNF_VPMMAXPGS + 1)

typedef struct {
	unsigned int	snfv_ref;
	frtn_t		snfv_frtn;
	vnode_t		*snfv_vp;
	struct vmap	snfv_vml[SNF_MAXVMAPS];
} snf_vmap_desbinfo;

typedef struct {
	frtn_t		snfi_frtn;
	caddr_t		snfi_base;
	uint_t		snfi_mapoff;
	size_t		snfi_len;
	vnode_t		*snfi_vp;
} snf_smap_desbinfo;
/*
 * The callback function used for vpm mapped mblks called when the last ref of
 * the mblk is dropped which normally occurs when TCP receives the ack. But it
 * can be the driver too due to lazy reclaim.
 */
static void
snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
{
	ASSERT(snfv->snfv_ref != 0);
	if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
		vpm_unmap_pages(snfv->snfv_vml, S_READ);
		VN_RELE(snfv->snfv_vp);
		kmem_free(snfv, sizeof (snf_vmap_desbinfo));
	}
}
/*
 * The callback function used for segmap'ped mblks called when the last ref of
 * the mblk is dropped which normally occurs when TCP receives the ack. But it
 * can be the driver too due to lazy reclaim.
 */
static void
snf_smap_desbfree(snf_smap_desbinfo *snfi)
{
	if (!IS_KPM_ADDR(snfi->snfi_base)) {
		/*
		 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
		 * segmap_kpm as long as the latter never falls back to
		 * "use_segmap_range". (See segmap_getmapflt().)
		 *
		 * Using S_OTHER saves a redundant hat_setref() in
		 * segmap_unlock().
		 */
		(void) segmap_fault(kas.a_hat, segkmap,
		    (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
		    snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
		    F_SOFTUNLOCK, S_OTHER);
	}
	(void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
	VN_RELE(snfi->snfi_vp);
	kmem_free(snfi, sizeof (*snfi));
}
/*
 * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
 * When segmap is used, the mblk contains a segmap slot of no more
 * than MAXBSIZE.
 *
 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
 * in each iteration and sent by socket_sendmblk until an error occurs or
 * the requested size has been transferred. An mblk is esballoca'ed from
 * each mapped page and a chain of these mblks is sent to the transport layer.
 * vpm will be called to unmap the pages when all mblks have been freed by
 * free_func.
 *
 * At the end of the whole sendfile() operation, we wait till the data from
 * the last mblk is ack'ed by the transport before returning so that the
 * caller of sendfile() can safely modify the file content.
 */
2396 snf_segmap(file_t
*fp
, vnode_t
*fvp
, uoff_t fileoff
, uoff_t total_size
,
2397 ssize_t
*count
, boolean_t nowait
)
2405 clock_t deadlk_wait
;
2409 boolean_t dowait
= B_FALSE
;
2415 bzero(&msg
, sizeof (msg
));
2418 if (ISSIG(curthread
, JUSTLOOKING
)) {
2424 snf_vmap_desbinfo
*snfv
;
2430 mapoff
= fileoff
& PAGEOFFSET
;
2431 maxsize
= MIN((SNF_VPMMAXPGS
* PAGESIZE
), total_size
);
2433 snfv
= kmem_zalloc(sizeof (snf_vmap_desbinfo
),
2437 * Get vpm mappings for maxsize with read access.
2438 * If the pages aren't available yet, we get
2439 * DEADLK, so wait and try again a little later using
2440 * an increasing wait. We might be here a long time.
2442 * If delay_sig returns EINTR, be sure to exit and
2443 * pass it up to the caller.
2446 while ((error
= vpm_map_pages(fvp
, fileoff
,
2447 (size_t)maxsize
, (VPM_FETCHPAGE
), snfv
->snfv_vml
,
2448 SNF_MAXVMAPS
, NULL
, S_READ
)) == EDEADLK
) {
2449 deadlk_wait
+= (deadlk_wait
< 5) ? 1 : 4;
2450 if ((error
= delay_sig(deadlk_wait
)) != 0) {
2455 kmem_free(snfv
, sizeof (snf_vmap_desbinfo
));
2456 error
= (error
== EINTR
) ? EINTR
: EIO
;
2459 snfv
->snfv_frtn
.free_func
= snf_vmap_desbfree
;
2460 snfv
->snfv_frtn
.free_arg
= (caddr_t
)snfv
;
2462 /* Construct the mblk chain from the page mappings */
2464 for (i
= 0; (snfv
->snfv_vml
[i
].vs_addr
!= NULL
) &&
2465 total_size
> 0; i
++) {
2466 ASSERT(chain_size
< maxsize
);
2467 mblk_size
= MIN(snfv
->snfv_vml
[i
].vs_len
-
2468 mapoff
, total_size
);
2470 (uchar_t
*)snfv
->snfv_vml
[i
].vs_addr
+
2471 mapoff
, mblk_size
, BPRI_HI
,
2475 * We return EAGAIN after unmapping the pages
2476 * if we cannot allocate the the head of the
2477 * chain. Otherwise, we continue sending the
2478 * mblks constructed so far.
2482 vpm_unmap_pages(snfv
->snfv_vml
,
2485 sizeof (snf_vmap_desbinfo
));
2491 /* Mark this dblk with the zero-copy flag */
2492 nmp
->b_datap
->db_struioflag
|= STRUIO_ZC
;
2493 nmp
->b_wptr
+= mblk_size
;
2494 chain_size
+= mblk_size
;
2495 fileoff
+= mblk_size
;
2496 total_size
-= mblk_size
;
2505 snfv
->snfv_vp
= fvp
;
		} else {
			/* vpm not supported. fallback to segmap */
			snf_smap_desbinfo *snfi;

			mapoff = fileoff & MAXBOFFSET;
			chain_size = MAXBSIZE - mapoff;
			if (chain_size > total_size)
				chain_size = total_size;
			/*
			 * we don't forcefault because we'll call
			 * segmap_fault(F_SOFTLOCK) next.
			 *
			 * S_READ will get the ref bit set (by either
			 * segmap_getmapflt() or segmap_fault()) and page
			 * shared locked.
			 */
			base = segmap_getmapflt(segkmap, fvp, fileoff,
			    chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);

			snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
			snfi->snfi_len = (size_t)roundup(mapoff + chain_size,
			    PAGESIZE) - (mapoff & PAGEMASK);
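			/*
			 * Worked example (assuming PAGESIZE is 0x1000, so
			 * PAGEMASK is ~0xFFF): with mapoff = 0x800 and
			 * chain_size = 0x1000, the transfer spans
			 * [0x800, 0x1800), so snfi_len = roundup(0x1800,
			 * 0x1000) - (0x800 & PAGEMASK) = 0x2000 - 0 = 0x2000,
			 * i.e. the two full pages that must be locked.
			 */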
			/*
			 * We must call segmap_fault() even for segmap_kpm
			 * because that's how error gets returned.
			 * (segmap_getmapflt() never fails but segmap_fault()
			 * does.)
			 *
			 * If the pages aren't available yet, we get
			 * DEADLK, so wait and try again a little later using
			 * an increasing wait. We might be here a long time.
			 *
			 * If delay_sig returns EINTR, be sure to exit and
			 * pass it up to the caller.
			 */
			deadlk_wait = 0;
			while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
			    segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
			    mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
			    S_READ))) == EDEADLK) {
				deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
				if ((error = delay_sig(deadlk_wait)) != 0)
					break;
			}
			if (error != 0) {
				(void) segmap_release(segkmap, base, 0);
				kmem_free(snfi, sizeof (*snfi));
				error = (error == EINTR) ? EINTR : EIO;
				break;
			}
			snfi->snfi_frtn.free_func = snf_smap_desbfree;
			snfi->snfi_frtn.free_arg = (caddr_t)snfi;
			snfi->snfi_base = base;
			snfi->snfi_mapoff = mapoff;
			mp = esballoca((uchar_t *)base + mapoff, chain_size,
			    BPRI_HI, &snfi->snfi_frtn);

			if (mp == NULL) {
				(void) segmap_fault(kas.a_hat, segkmap,
				    (caddr_t)(uintptr_t)(((uintptr_t)base +
				    mapoff) & PAGEMASK), snfi->snfi_len,
				    F_SOFTUNLOCK, S_OTHER);
				(void) segmap_release(segkmap, base, 0);
				kmem_free(snfi, sizeof (*snfi));
				error = EAGAIN;
				break;
			}
			VN_HOLD(fvp);
			snfi->snfi_vp = fvp;
			mp->b_wptr += chain_size;

			/* Mark this dblk with the zero-copy flag */
			mp->b_datap->db_struioflag |= STRUIO_ZC;
			fileoff += chain_size;
			total_size -= chain_size;
		}
		if (total_size == 0 && !nowait) {
			ASSERT(!dowait);
			dowait = B_TRUE;
			mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
		}
		fop_rwunlock(fvp, V_WRITELOCK_FALSE, NULL);
		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
		if (error != 0) {
			/*
			 * mp contains the mblks that were not sent by
			 * socket_sendmblk. Use its size to update *count.
			 */
			*count = ksize + (chain_size - msgdsize(mp));
			if (mp != NULL)
				freemsg(mp);
			return (error);
		}
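		/*
		 * Example of the accounting above: if ksize bytes were
		 * already sent in earlier iterations and this 8192-byte
		 * chain fails with 3000 bytes still unsent in mp, then
		 * msgdsize(mp) == 3000 and *count becomes
		 * ksize + (8192 - 3000); only data actually handed to the
		 * transport is reported to the caller.
		 */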
		ksize += chain_size;
		if (total_size == 0)
			goto done;

		(void) fop_rwlock(fvp, V_WRITELOCK_FALSE, NULL);
		va.va_mask = AT_SIZE;
		error = fop_getattr(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			break;
		if (total_size + fileoff > va.va_size)
			total_size = va.va_size - fileoff;
	}
out:
	fop_rwunlock(fvp, V_WRITELOCK_FALSE, NULL);
done:
	*count = ksize;
	if (dowait) {
		stdata_t *stp = vp->v_stream;

		if (stp == NULL) {
			struct sonode *so = VTOSO(vp);

			error = so_zcopy_wait(so);
		} else {
			mutex_enter(&stp->sd_lock);
			while (!(stp->sd_flag & STZCNOTIFY)) {
				if (cv_wait_sig(&stp->sd_zcopy_wait,
				    &stp->sd_lock) == 0) {
					error = EINTR;
					break;
				}
			}
			stp->sd_flag &= ~STZCNOTIFY;
			mutex_exit(&stp->sd_lock);
		}
	}
	return (error);
}
static int
snf_cache(file_t *fp, vnode_t *fvp, uoff_t fileoff, uoff_t size,
    uint_t maxpsz, ssize_t *count)
{
	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		wroff = so->so_proto_props.sopp_wroff;
		maxblk = so->so_proto_props.sopp_maxblk;
		extra = wroff + so->so_proto_props.sopp_tail;
	} else {
		wroff = (int)(stp->sd_wroff);
		maxblk = (int)(stp->sd_maxblk);
		extra = wroff + (int)(stp->sd_tail);
	}

	bzero(&msg, sizeof (msg));
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	for (;;) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			error = EINTR;
			break;
		}

		iosize = (int)MIN(maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);
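		/*
		 * Example: with maxpsz = 64K, 10K left to send, and an
		 * active filter advertising maxblk = 4K, the read is capped
		 * at MIN(MIN(64K, 10K), 4K) = 4K, so no single mblk handed
		 * to the filter stack exceeds the advertised block size.
		 */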
		mp = allocb(iosize + extra, BPRI_MED);
		if (mp == NULL) {
			error = EAGAIN;
			break;
		}
		mp->b_rptr += wroff;

		aiov.iov_base = (caddr_t)mp->b_rptr;
		aiov.iov_len = iosize;
		auio.uio_loffset = fileoff;
		auio.uio_resid = iosize;

		error = fop_read(fvp, &auio, ioflag, fp->f_cred, NULL);
		iosize -= auio.uio_resid;

		if (error == EINTR && iosize != 0)
			error = 0;

		if (error != 0 || iosize == 0) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + iosize;

		fop_rwunlock(fvp, V_WRITELOCK_FALSE, NULL);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
		(void) fop_rwlock(fvp, V_WRITELOCK_FALSE, NULL);
		va.va_mask = AT_SIZE;
		error = fop_getattr(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			size = 0;
		else if (size + fileoff > va.va_size)
			size = va.va_size - fileoff;
	}
	fop_rwunlock(fvp, V_WRITELOCK_FALSE, NULL);
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * Largefile support for 32 bit applications only.
 */
int
sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
    ssize32_t *count32)
{
	ssize32_t sfv_len;
	uoff_t sfv_off, va_size;
	struct vnode *vp, *fvp, *realvp;
	struct vattr va;
	stdata_t *stp;
	ssize_t count = 0;
	int error = 0;
	boolean_t dozcopy = B_FALSE;
	uint_t maxpsz;

	sfv_len = (ssize32_t)sfv->sfv_len;
	if (sfv_len < 0) {
		error = EINVAL;
		goto out;
	}

	if (sfv_len == 0) goto out;

	sfv_off = (uoff_t)sfv->sfv_off;
	/* Same checks as in pread */
	if (sfv_off > MAXOFFSET_T) {
		error = EINVAL;
		goto out;
	}
	if (sfv_off + sfv_len > MAXOFFSET_T)
		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

	/*
	 * There are no more checks on sfv_len. So, we cast it to
	 * uoff_t and share the snf_direct_io/snf_cache code between
	 * 32 bit and 64 bit.
	 *
	 * TODO: should do nbl_need_check() like read()?
	 */
	if (sfv_len > sendfile_max_size) {
		sf_stats.ss_file_not_cached++;
		error = snf_direct_io(fp, rfp, sfv_off, (uoff_t)sfv_len,
		    &count);
		goto out;
	}
	if (fop_realvp(fvp, &realvp, NULL) == 0)
		fvp = realvp;

	/*
	 * Grab the lock as a reader to prevent the file size
	 * from changing underneath.
	 */
	(void) fop_rwlock(fvp, V_WRITELOCK_FALSE, NULL);
	va.va_mask = AT_SIZE;
	error = fop_getattr(fvp, &va, 0, kcred, NULL);
	va_size = va.va_size;
	if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
		fop_rwunlock(fvp, V_WRITELOCK_FALSE, NULL);
		goto out;
	}
	/* Read as much as possible. */
	if (sfv_off + sfv_len > va_size)
		sfv_len = va_size - sfv_off;
	/*
	 * When the NOWAIT flag is not set, we enable zero-copy only if the
	 * transfer size is large enough. This prevents performance loss
	 * when the caller sends the file piece by piece.
	 */
	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
	    !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
		uint_t copyflag;

		copyflag = stp != NULL ? stp->sd_copyflag :
		    VTOSO(vp)->so_proto_props.sopp_zcopyflag;
		if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
			int on = 1;

			if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
			    SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
				dozcopy = B_TRUE;
		} else {
			dozcopy = copyflag & STZCVMSAFE;
		}
	}
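	/*
	 * Example of the threshold above (MAXBSIZE is 8K): for a 10 MB
	 * file, an 8 MB request clears both MAXBSIZE and the half-file bar
	 * (8 MB >= 5 MB), so zero-copy is considered; a series of 64 KB
	 * requests over the same file stays on the copying path unless
	 * SFV_NOWAIT is set or a request reaches 16 MB (0x1000000).
	 */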
	if (dozcopy) {
		sf_stats.ss_file_segmap++;
		error = snf_segmap(fp, fvp, sfv_off, (uoff_t)sfv_len,
		    &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
	} else {
		if (vp->v_type == VSOCK && stp == NULL) {
			sonode_t *so = VTOSO(vp);

			maxpsz = so->so_proto_props.sopp_maxpsz;
		} else if (stp != NULL) {
			maxpsz = stp->sd_qn_maxpsz;
		}

		if (maxpsz == INFPSZ)
			maxpsz = MAXBSIZE;
		else
			maxpsz = roundup(maxpsz, MAXBSIZE);
		sf_stats.ss_file_cached++;
		error = snf_cache(fp, fvp, sfv_off, (uoff_t)sfv_len,
		    maxpsz, &count);
	}
out:
	releasef(sfv->sfv_fd);
	*count32 = (ssize32_t)count;
	return (error);
}
#endif /* _SYSCALL32_IMPL || _ILP32 */
#ifdef _SYSCALL32_IMPL
/*
 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
 * ssize_t rather than ssize32_t; see the comments above read32 for details.
 */
ssize_t
recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}

ssize_t
recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
    caddr32_t name, caddr32_t namelenp)
{
	return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
	    (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
}

ssize_t
send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}

ssize_t
sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
    caddr32_t name, socklen_t namelen)
{
	return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
	    (void *)(uintptr_t)name, namelen));
}
#endif	/* _SYSCALL32_IMPL */
/*
 * Function wrappers (mostly around the sonode switch) for
 * backward compatibility.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
	return (socket_accept(so, fflag, CRED(), nsop));
}

int
sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int backlog, int flags)
{
	int error;

	error = socket_bind(so, name, namelen, flags, CRED());
	if (error == 0 && backlog != 0)
		return (socket_listen(so, backlog, CRED()));

	return (error);
}
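/*
 * Usage note (a sketch, not taken from a caller in this file): a nonzero
 * backlog makes sobind() behave like bind() followed by listen(), e.g.
 *
 *	error = sobind(so, (struct sockaddr *)&sin, sizeof (sin), 5, 0);
 *
 * binds so to sin and, on success, also puts it in the listening state
 * with a backlog of 5.
 */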
int
solisten(struct sonode *so, int backlog)
{
	return (socket_listen(so, backlog, CRED()));
}

int
soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int fflag, int flags)
{
	return (socket_connect(so, name, namelen, fflag, flags, CRED()));
}

int
sorecvmsg(struct sonode *so, struct msghdr *msg, struct uio *uiop)
{
	return (socket_recvmsg(so, msg, uiop, CRED()));
}

int
sosendmsg(struct sonode *so, struct msghdr *msg, struct uio *uiop)
{
	return (socket_sendmsg(so, msg, uiop, CRED()));
}

int
soshutdown(struct sonode *so, int how)
{
	return (socket_shutdown(so, how, CRED()));
}

int
sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
    socklen_t *optlenp, int flags)
{
	return (socket_getsockopt(so, level, option_name, optval, optlenp,
	    flags, CRED()));
}

int
sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
    t_uscalar_t optlen)
{
	return (socket_setsockopt(so, level, option_name, optval, optlen,
	    CRED()));
}
/*
 * Because this is a backward compatibility interface it only needs to be
 * able to handle the creation of TPI sockfs sockets.
 */
struct sonode *
socreate(struct sockparams *sp, int family, int type, int protocol,
    int *errorp)
{
	struct sonode *so;

	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
	    protocol, SOCKET_SLEEP, errorp, CRED());
	if (so == NULL) {
		SOCKPARAMS_DEC_REF(sp);
	} else {
		if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
			/* Cannot fail, only bumps so_count */
			(void) fop_open(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);