fm/ipmitopo: fix 64-bit compilation
[unleashed.git] / kernel / syscall / sendfile.c
blob b4a6940260817659f38ca926b18729787680c372
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <sys/types.h>
27 #include <sys/t_lock.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/buf.h>
31 #include <sys/conf.h>
32 #include <sys/cred.h>
33 #include <sys/kmem.h>
34 #include <sys/sysmacros.h>
35 #include <sys/vfs.h>
36 #include <sys/vnode.h>
37 #include <sys/debug.h>
38 #include <sys/errno.h>
39 #include <sys/time.h>
40 #include <sys/file.h>
41 #include <sys/open.h>
42 #include <sys/user.h>
43 #include <sys/termios.h>
44 #include <sys/stream.h>
45 #include <sys/strsubr.h>
46 #include <sys/sunddi.h>
47 #include <sys/esunddi.h>
48 #include <sys/flock.h>
49 #include <sys/modctl.h>
50 #include <sys/cmn_err.h>
51 #include <sys/vmsystm.h>
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <sockfs/sockcommon.h>
56 #include <sockfs/socktpi.h>
58 #include <netinet/in.h>
59 #include <sys/sendfile.h>
60 #include <sys/un.h>
61 #include <sys/tihdr.h>
62 #include <sys/atomic.h>
64 #include <inet/common.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/tcp.h>
69 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
70 ssize32_t *);
71 extern int snf_segmap(file_t *, vnode_t *, uoff_t, uoff_t, ssize_t *,
72 boolean_t);
73 extern sotpi_info_t *sotpi_sototpi(struct sonode *);
75 #define SEND_MAX_CHUNK 16
77 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
79 * 64 bit offsets for 32 bit applications only running either on
80 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
81 * more than 2GB of data.
/*
 * sendvec_chunk64: process one batch (at most SEND_MAX_CHUNK entries) of a
 * ksendfilevec64 vector -- the 64-bit-offset sendfile path used by 32-bit
 * callers (see the _SYSCALL32_IMPL/_ILP32 guard above).
 *
 *   fp, fileoff - destination file and its running write offset; *fileoff
 *                 is advanced in place for VREG targets.
 *   sfv         - first entry of the already-copied-in batch.
 *   copy_cnt    - number of entries in this batch.
 *   count       - running transferred-byte total, updated in place.  It is
 *                 a 32-bit signed count, so the overflow check below caps a
 *                 single call under 2GB.
 *
 * Returns 0 or an errno; on error, bytes already moved are still reflected
 * in *fileoff and *count.  Caller holds vp's rwlock (V_WRITELOCK_TRUE).
 *
 * NOTE(review): this listing appears to be a line-numbered web extraction;
 * lines at the omitted original numbers (blank lines, comment delimiters,
 * closing braces) are missing.  Recover the pristine file from the upstream
 * repository before compiling -- do not reconstruct the braces by guesswork.
 */
83 static int
84 sendvec_chunk64(file_t *fp, uoff_t *fileoff, struct ksendfilevec64 *sfv,
85 int copy_cnt, ssize32_t *count)
87 struct vnode *vp;
88 ushort_t fflag;
89 int ioflag;
90 size32_t cnt;
91 ssize32_t sfv_len;
92 ssize32_t tmpcount;
93 uoff_t sfv_off;
94 struct uio auio;
95 struct iovec aiov;
96 int i, error;
98 fflag = fp->f_flag;
99 vp = fp->f_vnode;
100 for (i = 0; i < copy_cnt; i++) {
/* Bail out early if a signal is pending against this thread. */
102 if (ISSIG(curthread, JUSTLOOKING))
103 return (EINTR);
106 * Do similar checks as "write" as we are writing
107 * sfv_len bytes into "vp".
109 sfv_len = (ssize32_t)sfv->sfv_len;
111 if (sfv_len == 0) {
112 sfv++;
113 continue;
116 if (sfv_len < 0)
117 return (EINVAL);
/* Enforce RLIMIT_FSIZE and the file's maximum offset for regular files. */
119 if (vp->v_type == VREG) {
120 if (*fileoff >= curproc->p_fsz_ctl) {
121 mutex_enter(&curproc->p_lock);
122 (void) rctl_action(
123 rctlproc_legacy[RLIMIT_FSIZE],
124 curproc->p_rctls, curproc, RCA_SAFE);
125 mutex_exit(&curproc->p_lock);
126 return (EFBIG);
129 if (*fileoff >= OFFSET_MAX(fp))
130 return (EFBIG);
132 if (*fileoff + sfv_len > OFFSET_MAX(fp))
133 return (EINVAL);
/* Reject a batch whose running 32-bit byte count would wrap negative. */
136 tmpcount = *count + sfv_len;
137 if (tmpcount < 0)
138 return (EINVAL);
140 sfv_off = sfv->sfv_off;
142 auio.uio_extflg = UIO_COPY_DEFAULT;
/* SFV_FD_SELF: sfv_off is a user-space address; write straight from it. */
143 if (sfv->sfv_fd == SFV_FD_SELF) {
144 aiov.iov_len = sfv_len;
145 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
146 auio.uio_loffset = *fileoff;
147 auio.uio_iovcnt = 1;
148 auio.uio_resid = sfv_len;
149 auio.uio_iov = &aiov;
150 auio.uio_segflg = UIO_USERSPACE;
151 auio.uio_llimit = curproc->p_fsz_ctl;
152 auio.uio_fmode = fflag;
153 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
154 while (sfv_len > 0) {
155 error = fop_write(vp, &auio, ioflag,
156 fp->f_cred, NULL);
/* Account for bytes written even on a partial/failed write. */
157 cnt = sfv_len - auio.uio_resid;
158 sfv_len -= cnt;
159 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
160 if (vp->v_type == VREG)
161 *fileoff += cnt;
162 *count += cnt;
163 if (error != 0)
164 return (error);
166 } else {
167 file_t *ffp;
168 vnode_t *readvp;
169 size_t size;
170 caddr_t ptr;
172 if ((ffp = getf(sfv->sfv_fd)) == NULL)
173 return (EBADF);
175 if ((ffp->f_flag & FREAD) == 0) {
176 releasef(sfv->sfv_fd);
177 return (EBADF);
180 readvp = ffp->f_vnode;
181 if (readvp->v_type != VREG) {
182 releasef(sfv->sfv_fd);
183 return (EINVAL);
187 * No point reading and writing to same vp,
188 * as long as both are regular files. readvp is not
189 * locked; but since we got it from an open file the
190 * contents will be valid during the time of access.
192 if (vn_compare(vp, readvp)) {
193 releasef(sfv->sfv_fd);
194 return (EINVAL);
198 * Optimize the regular file over
199 * the socket case.
201 if (vp->v_type == VSOCK) {
202 error = sosendfile64(fp, ffp, sfv,
203 (ssize32_t *)&cnt);
204 *count += cnt;
205 if (error)
206 return (error);
207 sfv++;
208 continue;
212 * Note: we assume readvp != vp. "vp" is already
213 * locked, and "readvp" must not be.
/* Take the two rwlocks in address order to avoid deadlock. */
215 if (readvp < vp) {
216 fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
217 (void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
218 NULL);
219 (void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
220 } else {
221 (void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
222 NULL);
226 * Same checks as in pread64.
228 if (sfv_off > MAXOFFSET_T) {
229 fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
230 releasef(sfv->sfv_fd);
231 return (EINVAL);
234 if (sfv_off + sfv_len > MAXOFFSET_T)
235 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
237 /* Find the native blocksize to transfer data */
238 size = MIN(vp->v_vfsp->vfs_bsize,
239 readvp->v_vfsp->vfs_bsize);
240 size = sfv_len < size ? sfv_len : size;
241 ptr = kmem_alloc(size, KM_NOSLEEP);
242 if (ptr == NULL) {
243 fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
244 releasef(sfv->sfv_fd);
245 return (ENOMEM);
/* Staged copy: read from readvp into the kernel buffer, write to vp. */
248 while (sfv_len > 0) {
249 size_t iov_len;
251 iov_len = MIN(size, sfv_len);
252 aiov.iov_base = ptr;
253 aiov.iov_len = iov_len;
254 auio.uio_loffset = sfv_off;
255 auio.uio_iov = &aiov;
256 auio.uio_iovcnt = 1;
257 auio.uio_resid = iov_len;
258 auio.uio_segflg = UIO_SYSSPACE;
259 auio.uio_llimit = MAXOFFSET_T;
260 auio.uio_fmode = ffp->f_flag;
261 ioflag = auio.uio_fmode &
262 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
265 * If read sync is not asked for,
266 * filter sync flags
268 if ((ioflag & FRSYNC) == 0)
269 ioflag &= ~(FSYNC|FDSYNC);
270 error = fop_read(readvp, &auio, ioflag,
271 fp->f_cred, NULL);
272 if (error) {
273 kmem_free(ptr, size);
274 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
275 NULL);
276 releasef(sfv->sfv_fd);
277 return (error);
281 * Check how must data was really read.
282 * Decrement the 'len' and increment the
283 * 'off' appropriately.
285 cnt = iov_len - auio.uio_resid;
286 if (cnt == 0) {
288 * If we were reading a pipe (currently
289 * not implemented), we may now lose
290 * data.
292 kmem_free(ptr, size);
293 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
294 NULL);
295 releasef(sfv->sfv_fd);
296 return (EINVAL);
298 sfv_len -= cnt;
299 sfv_off += cnt;
301 aiov.iov_base = ptr;
302 aiov.iov_len = cnt;
303 auio.uio_loffset = *fileoff;
304 auio.uio_iov = &aiov;
305 auio.uio_iovcnt = 1;
306 auio.uio_resid = cnt;
307 auio.uio_segflg = UIO_SYSSPACE;
308 auio.uio_llimit = curproc->p_fsz_ctl;
309 auio.uio_fmode = fflag;
310 ioflag = auio.uio_fmode &
311 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
312 error = fop_write(vp, &auio, ioflag,
313 fp->f_cred, NULL);
316 * Check how much data was written. Increment
317 * the 'len' and decrement the 'off' if all
318 * the data was not written.
320 cnt -= auio.uio_resid;
321 sfv_len += auio.uio_resid;
322 sfv_off -= auio.uio_resid;
323 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
324 if (vp->v_type == VREG)
325 *fileoff += cnt;
326 *count += cnt;
327 if (error != 0) {
328 kmem_free(ptr, size);
329 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
330 NULL);
331 releasef(sfv->sfv_fd);
332 return (error);
335 fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
336 releasef(sfv->sfv_fd);
337 kmem_free(ptr, size);
339 sfv++;
341 return (0);
/*
 * sendvec64: top-level driver for the SENDFILEV64 opcode (32-bit caller,
 * 64-bit file offsets).  Copies the user's ksendfilevec64 vector in
 * SEND_MAX_CHUNK-sized batches and hands each batch to sendvec_chunk64().
 *
 *   fp      - destination file (already held by the caller's getf()).
 *   vec     - user-space vector of ksendfilevec64 entries.
 *   sfvcnt  - number of entries in vec.
 *   xferred - user address receiving the 32-bit byte count on return.
 *   fildes  - destination fd, released here via releasef() before return.
 *
 * Returns the (32-bit) byte count on success or set_errno(error) on
 * failure.  The destination vnode's rwlock is held across all batches, and
 * f_offset is advanced for VREG targets.
 *
 * NOTE(review): line-numbered extraction; lines at the omitted original
 * numbers (closing braces, comment delimiters) are missing from this view.
 */
344 static ssize32_t
345 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
346 size32_t *xferred, int fildes)
348 uoff_t fileoff;
349 int copy_cnt;
350 const struct ksendfilevec64 *copy_vec;
351 struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
352 struct vnode *vp;
353 int error;
354 ssize32_t count = 0;
356 vp = fp->f_vnode;
357 (void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
359 copy_vec = vec;
360 fileoff = fp->f_offset;
/* Batch loop: copy in up to SEND_MAX_CHUNK entries, then process them. */
362 do {
363 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
364 if (copyin(copy_vec, sfv, copy_cnt *
365 sizeof (struct ksendfilevec64))) {
366 error = EFAULT;
367 break;
370 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
371 if (error != 0)
372 break;
374 copy_vec += copy_cnt;
375 sfvcnt -= copy_cnt;
376 } while (sfvcnt > 0);
/* Commit the advanced offset for regular files, even on partial error. */
378 if (vp->v_type == VREG)
379 fp->f_offset += count;
381 fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
382 if (copyout(&count, xferred, sizeof (count)))
383 error = EFAULT;
384 releasef(fildes);
385 if (error != 0)
386 return (set_errno(error));
387 return (count);
/*
 * sendvec_small_chunk: low-latency socket path for small transfers.
 * Gathers the whole batch (total_size bytes, from user memory and/or
 * regular source files) into a chain of mblks of at most maxblk bytes
 * each -- reserving 'wroff' header space and 'tail_len' trailer space per
 * mblk -- and hands the chain to the socket in a single socket_sendmblk()
 * call.  Chosen by sendfilev() when total_size <= 4 * maxblk.
 *
 *   fp, fileoff - destination socket (must be VSOCK; asserted) and offset.
 *   sfv         - first entry of the copied-in batch; copy_cnt entries.
 *   total_size  - exact byte total of the batch (asserted to be fully
 *                 consumed before the send).
 *   maxblk      - per-mblk payload cap (must be > 0; asserted).
 *   count       - running transferred-byte total, updated in place.
 *
 * Returns 0 or an errno.  Every error path frees the partially built
 * chain (freemsg(head)) and releases any source fd/lock taken.
 *
 * NOTE(review): line-numbered extraction; lines at the omitted original
 * numbers (closing braces, comment delimiters) are missing from this view.
 */
391 static int
392 sendvec_small_chunk(file_t *fp, uoff_t *fileoff, struct sendfilevec *sfv,
393 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
395 struct vnode *vp;
396 struct uio auio;
397 struct iovec aiov;
398 ushort_t fflag;
399 int ioflag;
400 int i, error;
401 size_t cnt;
402 ssize_t sfv_len;
403 uoff_t sfv_off;
404 #ifdef _SYSCALL32_IMPL
405 model_t model = get_udatamodel();
406 uoff_t maxoff = (model == DATAMODEL_ILP32) ?
407 INT32_MAX : MAXOFFSET_T;
408 #else
409 const uoff_t maxoff = INT32_MAX;
410 #endif
411 mblk_t *dmp = NULL;
412 int wroff;
413 int buf_left = 0;
414 size_t iov_len;
415 mblk_t *head, *tmp;
416 size_t size = total_size;
417 size_t extra;
418 int tail_len;
419 struct msghdr msg;
421 fflag = fp->f_flag;
422 vp = fp->f_vnode;
424 ASSERT(vp->v_type == VSOCK);
425 ASSERT(maxblk > 0);
427 /* If nothing to send, return */
428 if (total_size == 0)
429 return (0);
/* Header/trailer reservation comes from the stream head or sonode props. */
431 if (vp->v_stream != NULL) {
432 wroff = (int)vp->v_stream->sd_wroff;
433 tail_len = (int)vp->v_stream->sd_tail;
434 } else {
435 struct sonode *so;
437 so = VTOSO(vp);
438 wroff = so->so_proto_props.sopp_wroff;
439 tail_len = so->so_proto_props.sopp_tail;
442 extra = wroff + tail_len;
444 buf_left = MIN(total_size, maxblk);
445 head = dmp = allocb(buf_left + extra, BPRI_HI);
446 if (head == NULL)
447 return (ENOMEM);
448 head->b_wptr = head->b_rptr = head->b_rptr + wroff;
449 bzero(&msg, sizeof (msg));
451 auio.uio_extflg = UIO_COPY_DEFAULT;
452 for (i = 0; i < copy_cnt; i++) {
453 if (ISSIG(curthread, JUSTLOOKING)) {
454 freemsg(head);
455 return (EINTR);
459 * Do similar checks as "write" as we are writing
460 * sfv_len bytes into "vp".
462 sfv_len = (ssize_t)sfv->sfv_len;
464 if (sfv_len == 0) {
465 sfv++;
466 continue;
469 /* Check for overflow */
470 #ifdef _SYSCALL32_IMPL
471 if (model == DATAMODEL_ILP32) {
472 if (((ssize32_t)(*count + sfv_len)) < 0) {
473 freemsg(head);
474 return (EINVAL);
476 } else
477 #endif
478 if ((*count + sfv_len) < 0) {
479 freemsg(head);
480 return (EINVAL);
483 sfv_off = (uoff_t)(ulong_t)sfv->sfv_off;
/* SFV_FD_SELF: copy from the user address in sfv_off into the chain. */
485 if (sfv->sfv_fd == SFV_FD_SELF) {
486 while (sfv_len > 0) {
/* Current mblk full: allocate the next link of the chain. */
487 if (buf_left == 0) {
488 tmp = dmp;
489 buf_left = MIN(total_size, maxblk);
490 iov_len = MIN(buf_left, sfv_len);
491 dmp = allocb(buf_left + extra, BPRI_HI);
492 if (dmp == NULL) {
493 freemsg(head);
494 return (ENOMEM);
496 dmp->b_wptr = dmp->b_rptr =
497 dmp->b_rptr + wroff;
498 tmp->b_cont = dmp;
499 } else {
500 iov_len = MIN(buf_left, sfv_len);
503 aiov.iov_len = iov_len;
504 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
505 auio.uio_loffset = *fileoff;
506 auio.uio_iovcnt = 1;
507 auio.uio_resid = iov_len;
508 auio.uio_iov = &aiov;
509 auio.uio_segflg = UIO_USERSPACE;
510 auio.uio_llimit = curproc->p_fsz_ctl;
511 auio.uio_fmode = fflag;
513 buf_left -= iov_len;
514 total_size -= iov_len;
515 sfv_len -= iov_len;
516 sfv_off += iov_len;
518 error = uiomove((caddr_t)dmp->b_wptr,
519 iov_len, UIO_WRITE, &auio);
520 if (error != 0) {
521 freemsg(head);
522 return (error);
524 dmp->b_wptr += iov_len;
526 } else {
527 file_t *ffp;
528 vnode_t *readvp;
530 if ((ffp = getf(sfv->sfv_fd)) == NULL) {
531 freemsg(head);
532 return (EBADF);
535 if ((ffp->f_flag & FREAD) == 0) {
536 releasef(sfv->sfv_fd);
537 freemsg(head);
538 return (EACCES);
541 readvp = ffp->f_vnode;
542 if (readvp->v_type != VREG) {
543 releasef(sfv->sfv_fd);
544 freemsg(head);
545 return (EINVAL);
549 * No point reading and writing to same vp,
550 * as long as both are regular files. readvp is not
551 * locked; but since we got it from an open file the
552 * contents will be valid during the time of access.
555 if (vn_compare(vp, readvp)) {
556 releasef(sfv->sfv_fd);
557 freemsg(head);
558 return (EINVAL);
562 * Note: we assume readvp != vp. "vp" is already
563 * locked, and "readvp" must not be.
/* Take the two rwlocks in address order to avoid deadlock. */
566 if (readvp < vp) {
567 fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
568 (void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
569 NULL);
570 (void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
571 } else {
572 (void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
573 NULL);
576 /* Same checks as in pread */
577 if (sfv_off > maxoff) {
578 fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
579 releasef(sfv->sfv_fd);
580 freemsg(head);
581 return (EINVAL);
/* Clamp the read to maxoff and shrink total_size to match. */
583 if (sfv_off + sfv_len > maxoff) {
584 total_size -= (sfv_off + sfv_len - maxoff);
585 sfv_len = (ssize_t)((offset_t)maxoff -
586 sfv_off);
/* Read loop: fill the chain directly from the source file. */
589 while (sfv_len > 0) {
590 if (buf_left == 0) {
591 tmp = dmp;
592 buf_left = MIN(total_size, maxblk);
593 iov_len = MIN(buf_left, sfv_len);
594 dmp = allocb(buf_left + extra, BPRI_HI);
595 if (dmp == NULL) {
596 fop_rwunlock(readvp,
597 V_WRITELOCK_FALSE, NULL);
598 releasef(sfv->sfv_fd);
599 freemsg(head);
600 return (ENOMEM);
602 dmp->b_wptr = dmp->b_rptr =
603 dmp->b_rptr + wroff;
604 tmp->b_cont = dmp;
605 } else {
606 iov_len = MIN(buf_left, sfv_len);
608 aiov.iov_base = (caddr_t)dmp->b_wptr;
609 aiov.iov_len = iov_len;
610 auio.uio_loffset = sfv_off;
611 auio.uio_iov = &aiov;
612 auio.uio_iovcnt = 1;
613 auio.uio_resid = iov_len;
614 auio.uio_segflg = UIO_SYSSPACE;
615 auio.uio_llimit = MAXOFFSET_T;
616 auio.uio_fmode = ffp->f_flag;
617 ioflag = auio.uio_fmode &
618 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
621 * If read sync is not asked for,
622 * filter sync flags
624 if ((ioflag & FRSYNC) == 0)
625 ioflag &= ~(FSYNC|FDSYNC);
626 error = fop_read(readvp, &auio, ioflag,
627 fp->f_cred, NULL);
628 if (error != 0) {
630 * If we were reading a pipe (currently
631 * not implemented), we may now loose
632 * data.
634 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
635 NULL);
636 releasef(sfv->sfv_fd);
637 freemsg(head);
638 return (error);
642 * Check how much data was really read.
643 * Decrement the 'len' and increment the
644 * 'off' appropriately.
646 cnt = iov_len - auio.uio_resid;
647 if (cnt == 0) {
648 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
649 NULL);
650 releasef(sfv->sfv_fd);
651 freemsg(head);
652 return (EINVAL);
654 sfv_len -= cnt;
655 sfv_off += cnt;
656 total_size -= cnt;
657 buf_left -= cnt;
659 dmp->b_wptr += cnt;
661 fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
662 releasef(sfv->sfv_fd);
664 sfv++;
/* All batch bytes must now be in the chain; send it in one shot. */
667 ASSERT(total_size == 0);
668 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
669 if (error != 0) {
670 if (head != NULL)
671 freemsg(head);
672 return (error);
674 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
675 *count += size;
677 return (0);
/*
 * sendvec_chunk: general worker for one batch (<= SEND_MAX_CHUNK entries)
 * of a SENDFILEV vector.  Handles both VREG and VSOCK destinations:
 *
 *   - SFV_FD_SELF entries copy from a user address (per-mblk sends for
 *     sockets, plain fop_write() for regular files);
 *   - file-backed entries read from the source file in native-blocksize
 *     pieces; for sockets, a zero-copy segmap path (snf_segmap) is used
 *     when the source and socket permit it.
 *
 *   fp, fileoff - destination file/socket and its running write offset
 *                 (advanced in place for VREG targets).
 *   sfv         - first entry of the copied-in batch; copy_cnt entries.
 *   count       - running transferred-byte total, updated in place.
 *
 * Returns 0 or an errno; partial progress is reflected in *count and
 * *fileoff.  Caller holds vp's rwlock (V_WRITELOCK_TRUE).
 *
 * NOTE(review): line-numbered extraction; lines at the omitted original
 * numbers (closing braces, comment delimiters) are missing from this view.
 */
681 static int
682 sendvec_chunk(file_t *fp, uoff_t *fileoff, struct sendfilevec *sfv,
683 int copy_cnt, ssize_t *count)
685 struct vnode *vp;
686 struct uio auio;
687 struct iovec aiov;
688 ushort_t fflag;
689 int ioflag;
690 int i, error;
691 size_t cnt;
692 ssize_t sfv_len;
693 uoff_t sfv_off;
694 #ifdef _SYSCALL32_IMPL
695 model_t model = get_udatamodel();
696 uoff_t maxoff = (model == DATAMODEL_ILP32) ?
697 INT32_MAX : MAXOFFSET_T;
698 #else
699 const uoff_t maxoff = INT32_MAX;
700 #endif
701 mblk_t *dmp = NULL;
702 char *buf = NULL;
703 size_t extra;
704 int maxblk, wroff, tail_len;
705 struct sonode *so;
706 stdata_t *stp;
707 struct msghdr msg;
709 fflag = fp->f_flag;
710 vp = fp->f_vnode;
/* For sockets, fetch mblk sizing from the stream head or sonode props. */
712 if (vp->v_type == VSOCK) {
713 so = VTOSO(vp);
714 if (vp->v_stream != NULL) {
715 stp = vp->v_stream;
716 wroff = (int)stp->sd_wroff;
717 tail_len = (int)stp->sd_tail;
718 maxblk = (int)stp->sd_maxblk;
719 } else {
720 stp = NULL;
721 wroff = so->so_proto_props.sopp_wroff;
722 tail_len = so->so_proto_props.sopp_tail;
723 maxblk = so->so_proto_props.sopp_maxblk;
725 extra = wroff + tail_len;
728 bzero(&msg, sizeof (msg));
729 auio.uio_extflg = UIO_COPY_DEFAULT;
730 for (i = 0; i < copy_cnt; i++) {
731 if (ISSIG(curthread, JUSTLOOKING))
732 return (EINTR);
735 * Do similar checks as "write" as we are writing
736 * sfv_len bytes into "vp".
738 sfv_len = (ssize_t)sfv->sfv_len;
740 if (sfv_len == 0) {
741 sfv++;
742 continue;
/* Enforce RLIMIT_FSIZE and the data-model offset cap for regular files. */
745 if (vp->v_type == VREG) {
746 if (*fileoff >= curproc->p_fsz_ctl) {
747 mutex_enter(&curproc->p_lock);
748 (void) rctl_action(
749 rctlproc_legacy[RLIMIT_FSIZE],
750 curproc->p_rctls, curproc, RCA_SAFE);
751 mutex_exit(&curproc->p_lock);
753 return (EFBIG);
756 if (*fileoff >= maxoff)
757 return (EFBIG);
759 if (*fileoff + sfv_len > maxoff)
760 return (EINVAL);
763 /* Check for overflow */
764 #ifdef _SYSCALL32_IMPL
765 if (model == DATAMODEL_ILP32) {
766 if (((ssize32_t)(*count + sfv_len)) < 0)
767 return (EINVAL);
768 } else
769 #endif
770 if ((*count + sfv_len) < 0)
771 return (EINVAL);
773 sfv_off = (uoff_t)(ulong_t)sfv->sfv_off;
/* SFV_FD_SELF: sfv_off is a user-space address to send from. */
775 if (sfv->sfv_fd == SFV_FD_SELF) {
776 if (vp->v_type == VSOCK) {
777 while (sfv_len > 0) {
778 size_t iov_len;
780 iov_len = sfv_len;
782 * Socket filters can limit the mblk
783 * size, so limit reads to maxblk if
784 * there are filters present.
786 if (so->so_filter_active > 0 &&
787 maxblk != INFPSZ)
788 iov_len = MIN(iov_len, maxblk);
790 aiov.iov_len = iov_len;
791 aiov.iov_base =
792 (caddr_t)(uintptr_t)sfv_off;
794 auio.uio_iov = &aiov;
795 auio.uio_iovcnt = 1;
796 auio.uio_loffset = *fileoff;
797 auio.uio_segflg = UIO_USERSPACE;
798 auio.uio_fmode = fflag;
799 auio.uio_llimit = curproc->p_fsz_ctl;
800 auio.uio_resid = iov_len;
802 dmp = allocb(iov_len + extra, BPRI_HI);
803 if (dmp == NULL)
804 return (ENOMEM);
805 dmp->b_wptr = dmp->b_rptr =
806 dmp->b_rptr + wroff;
807 error = uiomove((caddr_t)dmp->b_wptr,
808 iov_len, UIO_WRITE, &auio);
809 if (error != 0) {
810 freeb(dmp);
811 return (error);
813 dmp->b_wptr += iov_len;
814 error = socket_sendmblk(VTOSO(vp),
815 &msg, fflag, CRED(), &dmp);
817 if (error != 0) {
818 if (dmp != NULL)
819 freeb(dmp);
820 return (error);
822 ttolwp(curthread)->lwp_ru.ioch +=
823 (ulong_t)iov_len;
824 *count += iov_len;
825 sfv_len -= iov_len;
826 sfv_off += iov_len;
828 } else {
829 aiov.iov_len = sfv_len;
830 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
832 auio.uio_iov = &aiov;
833 auio.uio_iovcnt = 1;
834 auio.uio_loffset = *fileoff;
835 auio.uio_segflg = UIO_USERSPACE;
836 auio.uio_fmode = fflag;
837 auio.uio_llimit = curproc->p_fsz_ctl;
838 auio.uio_resid = sfv_len;
840 ioflag = auio.uio_fmode &
841 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
842 while (sfv_len > 0) {
843 error = fop_write(vp, &auio, ioflag,
844 fp->f_cred, NULL);
845 cnt = sfv_len - auio.uio_resid;
846 sfv_len -= cnt;
847 ttolwp(curthread)->lwp_ru.ioch +=
848 (ulong_t)cnt;
849 *fileoff += cnt;
850 *count += cnt;
851 if (error != 0)
852 return (error);
855 } else {
856 int segmapit = 0;
857 file_t *ffp;
858 vnode_t *readvp;
859 struct vnode *realvp;
860 size_t size;
861 caddr_t ptr;
863 if ((ffp = getf(sfv->sfv_fd)) == NULL)
864 return (EBADF);
866 if ((ffp->f_flag & FREAD) == 0) {
867 releasef(sfv->sfv_fd);
868 return (EBADF);
871 readvp = ffp->f_vnode;
/* See through stacked filesystems (e.g. lofs) to the real source vnode. */
872 if (fop_realvp(readvp, &realvp, NULL) == 0)
873 readvp = realvp;
874 if (readvp->v_type != VREG) {
875 releasef(sfv->sfv_fd);
876 return (EINVAL);
880 * No point reading and writing to same vp,
881 * as long as both are regular files. readvp is not
882 * locked; but since we got it from an open file the
883 * contents will be valid during the time of access.
885 if (vn_compare(vp, readvp)) {
886 releasef(sfv->sfv_fd);
887 return (EINVAL);
891 * Note: we assume readvp != vp. "vp" is already
892 * locked, and "readvp" must not be.
/* Take the two rwlocks in address order to avoid deadlock. */
894 if (readvp < vp) {
895 fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
896 (void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
897 NULL);
898 (void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
899 } else {
900 (void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
901 NULL);
904 /* Same checks as in pread */
905 if (sfv_off > maxoff) {
906 fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
907 releasef(sfv->sfv_fd);
908 return (EINVAL);
910 if (sfv_off + sfv_len > maxoff) {
911 sfv_len = (ssize_t)((offset_t)maxoff -
912 sfv_off);
914 /* Find the native blocksize to transfer data */
915 size = MIN(vp->v_vfsp->vfs_bsize,
916 readvp->v_vfsp->vfs_bsize);
917 size = sfv_len < size ? sfv_len : size;
/* Regular-file target: staged copy via a kernel buffer.  Socket target:
 * decide between staged mblks and the zero-copy segmap path below. */
919 if (vp->v_type != VSOCK) {
920 segmapit = 0;
921 buf = kmem_alloc(size, KM_NOSLEEP);
922 if (buf == NULL) {
923 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
924 NULL);
925 releasef(sfv->sfv_fd);
926 return (ENOMEM);
928 } else {
929 uint_t copyflag;
931 copyflag = stp != NULL ? stp->sd_copyflag :
932 so->so_proto_props.sopp_zcopyflag;
935 * Socket filters can limit the mblk size,
936 * so limit reads to maxblk if there are
937 * filters present.
939 if (so->so_filter_active > 0 &&
940 maxblk != INFPSZ)
941 size = MIN(size, maxblk);
943 if (vn_has_flocks(readvp) ||
944 readvp->v_flag & VNOMAP ||
945 copyflag & STZCVMUNSAFE) {
946 segmapit = 0;
947 } else if (copyflag & STZCVMSAFE) {
948 segmapit = 1;
949 } else {
950 int on = 1;
951 if (socket_setsockopt(VTOSO(vp),
952 SOL_SOCKET, SO_SND_COPYAVOID,
953 &on, sizeof (on), CRED()) == 0)
954 segmapit = 1;
/* Zero-copy path: snf_segmap maps the source file and hands its pages
 * to the socket without an intermediate buffer copy. */
958 if (segmapit) {
959 struct vattr va;
960 boolean_t nowait;
962 va.va_mask = VATTR_SIZE;
963 error = fop_getattr(readvp, &va, 0, kcred,
964 NULL);
965 if (error != 0 || sfv_off >= va.va_size) {
966 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
967 NULL);
968 releasef(sfv->sfv_fd);
969 return (error);
971 /* Read as much as possible. */
972 if (sfv_off + sfv_len > va.va_size)
973 sfv_len = va.va_size - sfv_off;
975 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
976 error = snf_segmap(fp, readvp, sfv_off,
977 (uoff_t)sfv_len, (ssize_t *)&cnt,
978 nowait);
979 releasef(sfv->sfv_fd);
980 *count += cnt;
981 if (error)
982 return (error);
983 sfv++;
984 continue;
/* Staged copy: read from readvp into dmp/buf, then send/write to vp. */
987 while (sfv_len > 0) {
988 size_t iov_len;
990 iov_len = MIN(size, sfv_len);
992 if (vp->v_type == VSOCK) {
993 dmp = allocb(iov_len + extra, BPRI_HI);
994 if (dmp == NULL) {
995 fop_rwunlock(readvp,
996 V_WRITELOCK_FALSE, NULL);
997 releasef(sfv->sfv_fd);
998 return (ENOMEM);
1000 dmp->b_wptr = dmp->b_rptr =
1001 dmp->b_rptr + wroff;
1002 ptr = (caddr_t)dmp->b_rptr;
1003 } else {
1004 ptr = buf;
1007 aiov.iov_base = ptr;
1008 aiov.iov_len = iov_len;
1009 auio.uio_loffset = sfv_off;
1010 auio.uio_iov = &aiov;
1011 auio.uio_iovcnt = 1;
1012 auio.uio_resid = iov_len;
1013 auio.uio_segflg = UIO_SYSSPACE;
1014 auio.uio_llimit = MAXOFFSET_T;
1015 auio.uio_fmode = ffp->f_flag;
1016 ioflag = auio.uio_fmode &
1017 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1020 * If read sync is not asked for,
1021 * filter sync flags
1023 if ((ioflag & FRSYNC) == 0)
1024 ioflag &= ~(FSYNC|FDSYNC);
1025 error = fop_read(readvp, &auio, ioflag,
1026 fp->f_cred, NULL);
1027 if (error != 0) {
1029 * If we were reading a pipe (currently
1030 * not implemented), we may now lose
1031 * data.
1033 if (vp->v_type == VSOCK)
1034 freeb(dmp);
1035 else
1036 kmem_free(buf, size);
1037 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
1038 NULL);
1039 releasef(sfv->sfv_fd);
1040 return (error);
1044 * Check how much data was really read.
1045 * Decrement the 'len' and increment the
1046 * 'off' appropriately.
1048 cnt = iov_len - auio.uio_resid;
1049 if (cnt == 0) {
1050 if (vp->v_type == VSOCK)
1051 freeb(dmp);
1052 else
1053 kmem_free(buf, size);
1054 fop_rwunlock(readvp, V_WRITELOCK_FALSE,
1055 NULL);
1056 releasef(sfv->sfv_fd);
1057 return (EINVAL);
1059 sfv_len -= cnt;
1060 sfv_off += cnt;
1062 if (vp->v_type == VSOCK) {
1063 dmp->b_wptr = dmp->b_rptr + cnt;
1065 error = socket_sendmblk(VTOSO(vp),
1066 &msg, fflag, CRED(), &dmp);
1068 if (error != 0) {
1069 if (dmp != NULL)
1070 freeb(dmp);
1071 fop_rwunlock(readvp,
1072 V_WRITELOCK_FALSE, NULL);
1073 releasef(sfv->sfv_fd);
1074 return (error);
1077 ttolwp(curthread)->lwp_ru.ioch +=
1078 (ulong_t)cnt;
1079 *count += cnt;
1080 } else {
1082 aiov.iov_base = ptr;
1083 aiov.iov_len = cnt;
1084 auio.uio_loffset = *fileoff;
1085 auio.uio_resid = cnt;
1086 auio.uio_iov = &aiov;
1087 auio.uio_iovcnt = 1;
1088 auio.uio_segflg = UIO_SYSSPACE;
1089 auio.uio_llimit = curproc->p_fsz_ctl;
1090 auio.uio_fmode = fflag;
1091 ioflag = auio.uio_fmode &
1092 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1093 error = fop_write(vp, &auio, ioflag,
1094 fp->f_cred, NULL);
1097 * Check how much data was written.
1098 * Increment the 'len' and decrement the
1099 * 'off' if all the data was not
1100 * written.
1102 cnt -= auio.uio_resid;
1103 sfv_len += auio.uio_resid;
1104 sfv_off -= auio.uio_resid;
1105 ttolwp(curthread)->lwp_ru.ioch +=
1106 (ulong_t)cnt;
1107 *fileoff += cnt;
1108 *count += cnt;
1109 if (error != 0) {
1110 kmem_free(buf, size);
1111 fop_rwunlock(readvp,
1112 V_WRITELOCK_FALSE, NULL);
1113 releasef(sfv->sfv_fd);
1114 return (error);
1118 if (buf) {
1119 kmem_free(buf, size);
1120 buf = NULL;
1122 fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
1123 releasef(sfv->sfv_fd);
1125 sfv++;
1127 return (0);
/*
 * sendfilev: syscall entry point.  Transfers the data described by 'vec'
 * (sfvcnt sendfilevec entries: user memory via SFV_FD_SELF, or byte ranges
 * of open regular files) to the file or socket open on 'fildes'.
 *
 *   opcode  - SENDFILEV, or SENDFILEV64 (32-bit caller with 64-bit
 *             offsets; dispatched to sendvec64()).
 *   fildes  - destination fd; must be open FWRITE, and must be a VREG
 *             file or a VSOCK whose protocol advertises SM_SENDFILESUPP.
 *   xferred - user address receiving the byte count actually moved,
 *             written even when an error is returned.
 *
 * Returns the total byte count, or set_errno(...) on failure.  Vector
 * entries are validated as they are copied in; a bad entry truncates the
 * batch and is reported (first_vector_error) after prior entries are sent.
 * Batches of up to SEND_MAX_CHUNK entries go to sendvec_small_chunk()
 * (small socket transfers, <= 4 * maxblk) or sendvec_chunk() (everything
 * else).
 *
 * NOTE(review): line-numbered extraction; lines at the omitted original
 * numbers (closing braces, comment delimiters) are missing from this view.
 */
1130 ssize_t
1131 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1132 size_t *xferred)
1134 int error = 0;
1135 int first_vector_error = 0;
1136 file_t *fp;
1137 struct vnode *vp;
1138 struct sonode *so;
1139 uoff_t fileoff;
1140 int copy_cnt;
1141 const struct sendfilevec *copy_vec;
1142 struct sendfilevec sfv[SEND_MAX_CHUNK];
1143 ssize_t count = 0;
1144 #ifdef _SYSCALL32_IMPL
1145 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1146 #endif
1147 ssize_t total_size;
1148 int i;
1149 boolean_t is_sock = B_FALSE;
1150 int maxblk = 0;
1152 if (sfvcnt <= 0)
1153 return (set_errno(EINVAL));
1155 if ((fp = getf(fildes)) == NULL)
1156 return (set_errno(EBADF));
1158 if (((fp->f_flag) & FWRITE) == 0) {
1159 error = EBADF;
1160 goto err;
1163 fileoff = fp->f_offset;
1164 vp = fp->f_vnode;
/* Only socket and regular-file destinations are supported. */
1166 switch (vp->v_type) {
1167 case VSOCK:
1168 so = VTOSO(vp);
1169 is_sock = B_TRUE;
1170 if (SOCK_IS_NONSTR(so)) {
1171 maxblk = so->so_proto_props.sopp_maxblk;
1172 } else {
1173 maxblk = (int)vp->v_stream->sd_maxblk;
1177 * We need to make sure that the socket that we're sending on
1178 * supports sendfile behavior. sockfs doesn't know that the APIs
1179 * we want to use are coming from sendfile, so we can't rely on
1180 * it to check for us.
1182 if ((so->so_mode & SM_SENDFILESUPP) == 0) {
1183 error = EOPNOTSUPP;
1184 goto err;
1186 break;
1187 case VREG:
1188 break;
1189 default:
1190 error = EINVAL;
1191 goto err;
1194 switch (opcode) {
1195 case SENDFILEV :
1196 break;
1197 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/* SENDFILEV64 takes over entirely; sendvec64() releases fildes itself. */
1198 case SENDFILEV64 :
1199 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1200 (size32_t *)xferred, fildes));
1201 #endif
1202 default :
1203 error = ENOSYS;
1204 break;
1207 (void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
1208 copy_vec = vec;
/* Main loop: copy in up to SEND_MAX_CHUNK entries, validate, dispatch. */
1210 do {
1211 total_size = 0;
1212 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1213 #ifdef _SYSCALL32_IMPL
1214 /* 32-bit callers need to have their iovec expanded. */
1215 if (get_udatamodel() == DATAMODEL_ILP32) {
1216 if (copyin(copy_vec, sfv32,
1217 copy_cnt * sizeof (ksendfilevec32_t))) {
1218 error = EFAULT;
1219 break;
1222 for (i = 0; i < copy_cnt; i++) {
1223 sfv[i].sfv_fd = sfv32[i].sfv_fd;
1224 sfv[i].sfv_off =
1225 (off_t)(uint32_t)sfv32[i].sfv_off;
1226 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1227 total_size += sfv[i].sfv_len;
1228 sfv[i].sfv_flag = sfv32[i].sfv_flag;
1230 * Individual elements of the vector must not
1231 * wrap or overflow, as later math is signed.
1232 * Equally total_size needs to be checked after
1233 * each vector is added in, to be sure that
1234 * rogue values haven't overflowed the counter.
1236 if (((ssize32_t)sfv[i].sfv_len < 0) ||
1237 ((ssize32_t)total_size < 0)) {
1239 * Truncate the vector to send data
1240 * described by elements before the
1241 * error.
1243 copy_cnt = i;
1244 first_vector_error = EINVAL;
1245 /* total_size can't be trusted */
1246 if ((ssize32_t)total_size < 0)
1247 error = EINVAL;
1248 break;
1251 /* Nothing to do, process errors */
1252 if (copy_cnt == 0)
1253 break;
1255 } else {
1256 #endif
1257 if (copyin(copy_vec, sfv,
1258 copy_cnt * sizeof (sendfilevec_t))) {
1259 error = EFAULT;
1260 break;
1263 for (i = 0; i < copy_cnt; i++) {
1264 total_size += sfv[i].sfv_len;
1266 * Individual elements of the vector must not
1267 * wrap or overflow, as later math is signed.
1268 * Equally total_size needs to be checked after
1269 * each vector is added in, to be sure that
1270 * rogue values haven't overflowed the counter.
1272 if (((ssize_t)sfv[i].sfv_len < 0) ||
1273 (total_size < 0)) {
1275 * Truncate the vector to send data
1276 * described by elements before the
1277 * error.
1279 copy_cnt = i;
1280 first_vector_error = EINVAL;
1281 /* total_size can't be trusted */
1282 if (total_size < 0)
1283 error = EINVAL;
1284 break;
1287 /* Nothing to do, process errors */
1288 if (copy_cnt == 0)
1289 break;
1290 #ifdef _SYSCALL32_IMPL
1292 #endif
1295 * The task between deciding to use sendvec_small_chunk
1296 * and sendvec_chunk is dependant on multiple things:
1298 * i) latency is important for smaller files. So if the
1299 * data is smaller than 'tcp_slow_start_initial' times
1300 * maxblk, then use sendvec_small_chunk which creates
1301 * maxblk size mblks and chains them together and sends
1302 * them to TCP in one shot. It also leaves 'wroff' size
1303 * space for the headers in each mblk.
1305 * ii) for total size bigger than 'tcp_slow_start_initial'
1306 * time maxblk, its probably real file data which is
1307 * dominating. So its better to use sendvec_chunk because
1308 * performance goes to dog if we don't do pagesize reads.
1309 * sendvec_chunk will do pagesize reads and write them
1310 * in pagesize mblks to TCP.
1312 * Side Notes: A write to file has not been optimized.
1313 * Future zero copy code will plugin into sendvec_chunk
1314 * only because doing zero copy for files smaller then
1315 * pagesize is useless.
1317 if (is_sock) {
1318 if ((total_size <= (4 * maxblk)) &&
1319 error == 0) {
1320 error = sendvec_small_chunk(fp,
1321 &fileoff, sfv, copy_cnt,
1322 total_size, maxblk, &count);
1323 } else {
1324 error = sendvec_chunk(fp, &fileoff,
1325 sfv, copy_cnt, &count);
1327 } else {
1328 ASSERT(vp->v_type == VREG);
1329 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1330 &count);
1334 #ifdef _SYSCALL32_IMPL
1335 if (get_udatamodel() == DATAMODEL_ILP32) {
1336 copy_vec = (const struct sendfilevec *)
1337 ((char *)copy_vec +
1338 (copy_cnt * sizeof (ksendfilevec32_t)));
1339 } else
1340 #endif
1341 copy_vec += copy_cnt;
1342 sfvcnt -= copy_cnt;
1344 /* Process all vector members up to first error */
1345 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);
1347 if (vp->v_type == VREG)
1348 fp->f_offset += count;
1350 fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
/* Report the byte count to the caller before surfacing any error. */
1352 #ifdef _SYSCALL32_IMPL
1353 if (get_udatamodel() == DATAMODEL_ILP32) {
1354 ssize32_t count32 = (ssize32_t)count;
1355 if (copyout(&count32, xferred, sizeof (count32)))
1356 error = EFAULT;
1357 releasef(fildes);
1358 if (error != 0)
1359 return (set_errno(error));
1360 if (first_vector_error != 0)
1361 return (set_errno(first_vector_error));
1362 return (count32);
1364 #endif
1365 if (copyout(&count, xferred, sizeof (count)))
1366 error = EFAULT;
1367 releasef(fildes);
1368 if (error != 0)
1369 return (set_errno(error));
1370 if (first_vector_error != 0)
1371 return (set_errno(first_vector_error));
1372 return (count);
1373 err:
1374 ASSERT(error != 0);
1375 releasef(fildes);
1376 return (set_errno(error));