usr/src/uts/common/syscall/sendfile.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 */
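
/*
 * This file implements the kernel side of sendfilev(): the caller passes a
 * vector of sendfilevec entries, each describing either a region of its own
 * address space (sfv_fd == SFV_FD_SELF) or a byte range of an open regular
 * file, and the data is written to the target descriptor, which may be a
 * regular file or a connected socket.
 *
 * Illustration only (user-level code, not part of this file; it assumes the
 * libsendfile interface documented in sendfilev(3EXT), and the identifiers
 * sockfd, filefd, hdr, hdrlen and filesize are placeholders): a caller
 * sending an in-memory header followed by a file's contents might do
 *
 *	struct sendfilevec vec[2];
 *	size_t xferred;
 *
 *	vec[0].sfv_fd = SFV_FD_SELF;	(header taken from user memory;
 *	vec[0].sfv_flag = 0;		 sfv_off holds the data pointer)
 *	vec[0].sfv_off = (off_t)hdr;
 *	vec[0].sfv_len = hdrlen;
 *	vec[1].sfv_fd = filefd;		(file contents, starting at offset 0)
 *	vec[1].sfv_flag = 0;
 *	vec[1].sfv_off = 0;
 *	vec[1].sfv_len = filesize;
 *	(void) sendfilev(sockfd, vec, 2, &xferred);
 */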

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
    ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
    int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
    boolean_t);
extern sotpi_info_t *sotpi_sototpi(struct sonode *);
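
/*
 * User-supplied vectors are copied in and processed at most SEND_MAX_CHUNK
 * entries at a time; this bounds the size of the on-stack sfv[] (and, for
 * 32-bit callers, sfv32[]) staging arrays declared in sendvec64() and
 * sendfilev() below.
 */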

#define SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
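
/*
 * sendvec_chunk64() processes one batch of up to SEND_MAX_CHUNK
 * ksendfilevec64 entries against the destination vnode "vp", which the
 * caller has already write-locked.  SFV_FD_SELF entries are written
 * directly from user memory with VOP_WRITE(); entries naming another file
 * descriptor must refer to a regular file and are either handed to
 * sosendfile64() (regular file to socket) or staged through a temporary
 * kernel buffer with VOP_READ()/VOP_WRITE().  *fileoff and *count are
 * advanced as data is transferred.
 */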
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
    struct vnode *vp;
    ushort_t fflag;
    int ioflag;
    size32_t cnt;
    ssize32_t sfv_len;
    ssize32_t tmpcount;
    u_offset_t sfv_off;
    struct uio auio;
    struct iovec aiov;
    int i, error;

    fflag = fp->f_flag;
    vp = fp->f_vnode;
    for (i = 0; i < copy_cnt; i++) {

        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize32_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        if (sfv_len < 0)
            return (EINVAL);

        if (vp->v_type == VREG) {
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);
                return (EFBIG);
            }

            if (*fileoff >= OFFSET_MAX(fp))
                return (EFBIG);

            if (*fileoff + sfv_len > OFFSET_MAX(fp))
                return (EINVAL);
        }

        tmpcount = *count + sfv_len;
        if (tmpcount < 0)
            return (EINVAL);

        sfv_off = sfv->sfv_off;

        auio.uio_extflg = UIO_COPY_DEFAULT;
        if (sfv->sfv_fd == SFV_FD_SELF) {
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;
            ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
            while (sfv_len > 0) {
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);
                cnt = sfv_len - auio.uio_resid;
                sfv_len -= cnt;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0)
                    return (error);
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Optimize the regular file over
             * the socket case.
             */
            if (vp->v_type == VSOCK) {
                error = sosendfile64(fp, ffp, sfv,
                    (ssize32_t *)&cnt);
                *count += cnt;
                if (error)
                    return (error);
                sfv++;
                continue;
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            if (readvp < vp) {
                VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
                (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
                    NULL);
                (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
            } else {
                (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
                    NULL);
            }

            /*
             * Same checks as in pread64.
             */
            if (sfv_off > MAXOFFSET_T) {
                VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            if (sfv_off + sfv_len > MAXOFFSET_T)
                sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;
            ptr = kmem_alloc(size, KM_NOSLEEP);
            if (ptr == NULL) {
                VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
                releasef(sfv->sfv_fd);
                return (ENOMEM);
            }

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);
                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                aiov.iov_base = ptr;
                aiov.iov_len = cnt;
                auio.uio_loffset = *fileoff;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = cnt;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);

                /*
                 * Check how much data was written. Increment
                 * the 'len' and decrement the 'off' if all
                 * the data was not written.
                 */
                cnt -= auio.uio_resid;
                sfv_len += auio.uio_resid;
                sfv_off -= auio.uio_resid;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }
            }
            VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
            releasef(sfv->sfv_fd);
            kmem_free(ptr, size);
        }
        sfv++;
    }
    return (0);
}
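
/*
 * sendvec64() is the SENDFILEV64 path used by 32-bit callers.  It
 * write-locks the destination vnode, copies the user's ksendfilevec64
 * array in SEND_MAX_CHUNK-sized batches, and feeds each batch to
 * sendvec_chunk64().  The byte count transferred is copied out through
 * "xferred" and also returned.
 */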
ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
    u_offset_t fileoff;
    int copy_cnt;
    const struct ksendfilevec64 *copy_vec;
    struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
    struct vnode *vp;
    int error;
    ssize32_t count = 0;

    vp = fp->f_vnode;
    (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);

    copy_vec = vec;
    fileoff = fp->f_offset;

    do {
        copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
        if (copyin(copy_vec, sfv, copy_cnt *
            sizeof (struct ksendfilevec64))) {
            error = EFAULT;
            break;
        }

        error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
        if (error != 0)
            break;

        copy_vec += copy_cnt;
        sfvcnt -= copy_cnt;
    } while (sfvcnt > 0);

    if (vp->v_type == VREG)
        fp->f_offset += count;

    VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
    if (copyout(&count, xferred, sizeof (count)))
        error = EFAULT;
    releasef(fildes);
    if (error != 0)
        return (set_errno(error));
    return (count);
}
#endif
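
/*
 * sendvec_small_chunk() handles small transfers to a socket: the entire
 * request (total_size bytes) is copied into a chain of maxblk-sized mblks,
 * each with "wroff" bytes of headroom for protocol headers, and the whole
 * chain is handed to the transport in a single socket_sendmblk() call.
 * The destination vnode must be a socket.
 */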
int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    int wroff;
    int buf_left = 0;
    size_t iov_len;
    mblk_t *head, *tmp;
    size_t size = total_size;
    size_t extra;
    int tail_len;
    struct nmsghdr msg;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    ASSERT(vp->v_type == VSOCK);
    ASSERT(maxblk > 0);

    /* If nothing to send, return */
    if (total_size == 0)
        return (0);

    if (vp->v_stream != NULL) {
        wroff = (int)vp->v_stream->sd_wroff;
        tail_len = (int)vp->v_stream->sd_tail;
    } else {
        struct sonode *so;

        so = VTOSO(vp);
        wroff = so->so_proto_props.sopp_wroff;
        tail_len = so->so_proto_props.sopp_tail;
    }

    extra = wroff + tail_len;

    buf_left = MIN(total_size, maxblk);
    head = dmp = allocb(buf_left + extra, BPRI_HI);
    if (head == NULL)
        return (ENOMEM);
    head->b_wptr = head->b_rptr = head->b_rptr + wroff;
    bzero(&msg, sizeof (msg));

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING)) {
            freemsg(head);
            return (EINTR);
        }

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0) {
                freemsg(head);
                return (EINVAL);
            }
        } else
#endif
        if ((*count + sfv_len) < 0) {
            freemsg(head);
            return (EINVAL);
        }

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            while (sfv_len > 0) {
                if (buf_left == 0) {
                    tmp = dmp;
                    buf_left = MIN(total_size, maxblk);
                    iov_len = MIN(buf_left, sfv_len);
                    dmp = allocb(buf_left + extra, BPRI_HI);
                    if (dmp == NULL) {
                        freemsg(head);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    tmp->b_cont = dmp;
                } else {
                    iov_len = MIN(buf_left, sfv_len);
                }

                aiov.iov_len = iov_len;
                aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
                auio.uio_loffset = *fileoff;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_iov = &aiov;
                auio.uio_segflg = UIO_USERSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;

                buf_left -= iov_len;
                total_size -= iov_len;
                sfv_len -= iov_len;
                sfv_off += iov_len;

                error = uiomove((caddr_t)dmp->b_wptr,
                    iov_len, UIO_WRITE, &auio);
                if (error != 0) {
                    freemsg(head);
                    return (error);
                }
                dmp->b_wptr += iov_len;
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;

            if ((ffp = getf(sfv->sfv_fd)) == NULL) {
                freemsg(head);
                return (EBADF);
            }

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EACCES);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            if (readvp < vp) {
                VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
                (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
                    NULL);
                (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
            } else {
                (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
                    NULL);
            }

            /* Same checks as in pread */
            if (sfv_off > maxoff) {
                VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }
            if (sfv_off + sfv_len > maxoff) {
                total_size -= (sfv_off + sfv_len - maxoff);
                sfv_len = (ssize_t)((offset_t)maxoff -
                    sfv_off);
            }

            while (sfv_len > 0) {
                if (buf_left == 0) {
                    tmp = dmp;
                    buf_left = MIN(total_size, maxblk);
                    iov_len = MIN(buf_left, sfv_len);
                    dmp = allocb(buf_left + extra, BPRI_HI);
                    if (dmp == NULL) {
                        VOP_RWUNLOCK(readvp,
                            V_WRITELOCK_FALSE, NULL);
                        releasef(sfv->sfv_fd);
                        freemsg(head);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    tmp->b_cont = dmp;
                } else {
                    iov_len = MIN(buf_left, sfv_len);
                }

                aiov.iov_base = (caddr_t)dmp->b_wptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error != 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    freemsg(head);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    freemsg(head);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;
                total_size -= cnt;
                buf_left -= cnt;

                dmp->b_wptr += cnt;
            }
            VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
            releasef(sfv->sfv_fd);
        }
        sfv++;
    }

    ASSERT(total_size == 0);
    error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
    if (error != 0) {
        if (head != NULL)
            freemsg(head);
        return (error);
    }
    ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
    *count += size;

    return (0);
}
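
/*
 * sendvec_chunk() is the general path for one batch of sendfilevec
 * entries.  SFV_FD_SELF data is copied from user memory and either written
 * with VOP_WRITE() (regular file destination) or sent one mblk at a time
 * with socket_sendmblk() (socket destination).  File-to-socket transfers
 * use the zero-copy segmap path via snf_segmap() when that is safe;
 * otherwise data is staged through a kernel buffer sized to the native
 * block size of the filesystems involved.
 */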
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    char *buf = NULL;
    size_t extra;
    int maxblk, wroff, tail_len;
    struct sonode *so;
    stdata_t *stp;
    struct nmsghdr msg;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    if (vp->v_type == VSOCK) {
        so = VTOSO(vp);
        if (vp->v_stream != NULL) {
            stp = vp->v_stream;
            wroff = (int)stp->sd_wroff;
            tail_len = (int)stp->sd_tail;
            maxblk = (int)stp->sd_maxblk;
        } else {
            stp = NULL;
            wroff = so->so_proto_props.sopp_wroff;
            tail_len = so->so_proto_props.sopp_tail;
            maxblk = so->so_proto_props.sopp_maxblk;
        }
        extra = wroff + tail_len;
    }

    bzero(&msg, sizeof (msg));
    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        if (vp->v_type == VREG) {
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);

                return (EFBIG);
            }

            if (*fileoff >= maxoff)
                return (EFBIG);

            if (*fileoff + sfv_len > maxoff)
                return (EINVAL);
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0)
                return (EINVAL);
        } else
#endif
        if ((*count + sfv_len) < 0)
            return (EINVAL);

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            if (vp->v_type == VSOCK) {
                while (sfv_len > 0) {
                    size_t iov_len;

                    iov_len = sfv_len;
                    /*
                     * Socket filters can limit the mblk
                     * size, so limit reads to maxblk if
                     * there are filters present.
                     */
                    if (so->so_filter_active > 0 &&
                        maxblk != INFPSZ)
                        iov_len = MIN(iov_len, maxblk);

                    aiov.iov_len = iov_len;
                    aiov.iov_base =
                        (caddr_t)(uintptr_t)sfv_off;

                    auio.uio_iov = &aiov;
                    auio.uio_iovcnt = 1;
                    auio.uio_loffset = *fileoff;
                    auio.uio_segflg = UIO_USERSPACE;
                    auio.uio_fmode = fflag;
                    auio.uio_llimit = curproc->p_fsz_ctl;
                    auio.uio_resid = iov_len;

                    dmp = allocb(iov_len + extra, BPRI_HI);
                    if (dmp == NULL)
                        return (ENOMEM);
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    error = uiomove((caddr_t)dmp->b_wptr,
                        iov_len, UIO_WRITE, &auio);
                    if (error != 0) {
                        freeb(dmp);
                        return (error);
                    }
                    dmp->b_wptr += iov_len;
                    error = socket_sendmblk(VTOSO(vp),
                        &msg, fflag, CRED(), &dmp);

                    if (error != 0) {
                        if (dmp != NULL)
                            freeb(dmp);
                        return (error);
                    }
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)iov_len;
                    *count += iov_len;
                    sfv_len -= iov_len;
                    sfv_off += iov_len;
                }
            } else {
                aiov.iov_len = sfv_len;
                aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;

                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_loffset = *fileoff;
                auio.uio_segflg = UIO_USERSPACE;
                auio.uio_fmode = fflag;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_resid = sfv_len;

                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                while (sfv_len > 0) {
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);
                    cnt = sfv_len - auio.uio_resid;
                    sfv_len -= cnt;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0)
                        return (error);
                }
            }
        } else {
            int segmapit = 0;
            file_t *ffp;
            vnode_t *readvp;
            struct vnode *realvp;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (VOP_REALVP(readvp, &realvp, NULL) == 0)
                readvp = realvp;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            if (readvp < vp) {
                VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
                (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
                    NULL);
                (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
            } else {
                (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
                    NULL);
            }

            /* Same checks as in pread */
            if (sfv_off > maxoff) {
                VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }
            if (sfv_off + sfv_len > maxoff) {
                sfv_len = (ssize_t)((offset_t)maxoff -
                    sfv_off);
            }
            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;

            if (vp->v_type != VSOCK) {
                segmapit = 0;
                buf = kmem_alloc(size, KM_NOSLEEP);
                if (buf == NULL) {
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    return (ENOMEM);
                }
            } else {
                uint_t copyflag;

                copyflag = stp != NULL ? stp->sd_copyflag :
                    so->so_proto_props.sopp_zcopyflag;

                /*
                 * Socket filters can limit the mblk size,
                 * so limit reads to maxblk if there are
                 * filters present.
                 */
                if (so->so_filter_active > 0 &&
                    maxblk != INFPSZ)
                    size = MIN(size, maxblk);

                if (vn_has_flocks(readvp) ||
                    readvp->v_flag & VNOMAP ||
                    copyflag & STZCVMUNSAFE) {
                    segmapit = 0;
                } else if (copyflag & STZCVMSAFE) {
                    segmapit = 1;
                } else {
                    int on = 1;
                    if (socket_setsockopt(VTOSO(vp),
                        SOL_SOCKET, SO_SND_COPYAVOID,
                        &on, sizeof (on), CRED()) == 0)
                        segmapit = 1;
                }
            }

            if (segmapit) {
                boolean_t nowait;

                nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
                error = snf_segmap(fp, readvp, sfv_off,
                    (u_offset_t)sfv_len, (ssize_t *)&cnt,
                    nowait);
                releasef(sfv->sfv_fd);
                *count += cnt;
                if (error)
                    return (error);
                sfv++;
                continue;
            }

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);

                if (vp->v_type == VSOCK) {
                    dmp = allocb(iov_len + extra, BPRI_HI);
                    if (dmp == NULL) {
                        VOP_RWUNLOCK(readvp,
                            V_WRITELOCK_FALSE, NULL);
                        releasef(sfv->sfv_fd);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    ptr = (caddr_t)dmp->b_rptr;
                } else {
                    ptr = buf;
                }

                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error != 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
                        NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                if (vp->v_type == VSOCK) {
                    dmp->b_wptr = dmp->b_rptr + cnt;

                    error = socket_sendmblk(VTOSO(vp),
                        &msg, fflag, CRED(), &dmp);

                    if (error != 0) {
                        if (dmp != NULL)
                            freeb(dmp);
                        VOP_RWUNLOCK(readvp,
                            V_WRITELOCK_FALSE, NULL);
                        releasef(sfv->sfv_fd);
                        return (error);
                    }

                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *count += cnt;
                } else {

                    aiov.iov_base = ptr;
                    aiov.iov_len = cnt;
                    auio.uio_loffset = *fileoff;
                    auio.uio_resid = cnt;
                    auio.uio_iov = &aiov;
                    auio.uio_iovcnt = 1;
                    auio.uio_segflg = UIO_SYSSPACE;
                    auio.uio_llimit = curproc->p_fsz_ctl;
                    auio.uio_fmode = fflag;
                    ioflag = auio.uio_fmode &
                        (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);

                    /*
                     * Check how much data was written.
                     * Increment the 'len' and decrement the
                     * 'off' if all the data was not
                     * written.
                     */
                    cnt -= auio.uio_resid;
                    sfv_len += auio.uio_resid;
                    sfv_off -= auio.uio_resid;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0) {
                        kmem_free(buf, size);
                        VOP_RWUNLOCK(readvp,
                            V_WRITELOCK_FALSE, NULL);
                        releasef(sfv->sfv_fd);
                        return (error);
                    }
                }
            }
            if (buf) {
                kmem_free(buf, size);
                buf = NULL;
            }
            VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
            releasef(sfv->sfv_fd);
        }
        sfv++;
    }
    return (0);
}
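
/*
 * sendfilev() is the system call entry point.  It validates the target
 * descriptor (regular file or socket), copies the user's vector in
 * SEND_MAX_CHUNK-sized batches (expanding 32-bit vectors as needed),
 * checks each batch for length overflow, and dispatches it to NL7C,
 * sendvec_small_chunk() or sendvec_chunk() as appropriate.  The total
 * byte count is copied out through "xferred" and returned; on failure
 * -1 is returned with errno set.
 */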
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
    int error = 0;
    int first_vector_error = 0;
    file_t *fp;
    struct vnode *vp;
    struct sonode *so;
    u_offset_t fileoff;
    int copy_cnt;
    const struct sendfilevec *copy_vec;
    struct sendfilevec sfv[SEND_MAX_CHUNK];
    ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
    struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
    ssize_t total_size;
    int i;
    boolean_t is_sock = B_FALSE;
    int maxblk = 0;

    if (sfvcnt <= 0)
        return (set_errno(EINVAL));

    if ((fp = getf(fildes)) == NULL)
        return (set_errno(EBADF));

    if (((fp->f_flag) & FWRITE) == 0) {
        error = EBADF;
        goto err;
    }

    fileoff = fp->f_offset;
    vp = fp->f_vnode;

    switch (vp->v_type) {
    case VSOCK:
        so = VTOSO(vp);
        is_sock = B_TRUE;
        if (SOCK_IS_NONSTR(so)) {
            maxblk = so->so_proto_props.sopp_maxblk;
        } else {
            maxblk = (int)vp->v_stream->sd_maxblk;
        }
        break;
    case VREG:
        break;
    default:
        error = EINVAL;
        goto err;
    }

    switch (opcode) {
    case SENDFILEV:
        break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
    case SENDFILEV64:
        return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
            (size32_t *)xferred, fildes));
#endif
    default:
        error = ENOSYS;
        break;
    }

    (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
    copy_vec = vec;

    do {
        total_size = 0;
        copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
        /* 32-bit callers need to have their iovec expanded. */
        if (get_udatamodel() == DATAMODEL_ILP32) {
            if (copyin(copy_vec, sfv32,
                copy_cnt * sizeof (ksendfilevec32_t))) {
                error = EFAULT;
                break;
            }

            for (i = 0; i < copy_cnt; i++) {
                sfv[i].sfv_fd = sfv32[i].sfv_fd;
                sfv[i].sfv_off =
                    (off_t)(uint32_t)sfv32[i].sfv_off;
                sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
                total_size += sfv[i].sfv_len;
                sfv[i].sfv_flag = sfv32[i].sfv_flag;
                /*
                 * Individual elements of the vector must not
                 * wrap or overflow, as later math is signed.
                 * Equally total_size needs to be checked after
                 * each vector is added in, to be sure that
                 * rogue values haven't overflowed the counter.
                 */
                if (((ssize32_t)sfv[i].sfv_len < 0) ||
                    ((ssize32_t)total_size < 0)) {
                    /*
                     * Truncate the vector to send data
                     * described by elements before the
                     * error.
                     */
                    copy_cnt = i;
                    first_vector_error = EINVAL;
                    /* total_size can't be trusted */
                    if ((ssize32_t)total_size < 0)
                        error = EINVAL;
                    break;
                }
            }
            /* Nothing to do, process errors */
            if (copy_cnt == 0)
                break;

        } else {
#endif
            if (copyin(copy_vec, sfv,
                copy_cnt * sizeof (sendfilevec_t))) {
                error = EFAULT;
                break;
            }

            for (i = 0; i < copy_cnt; i++) {
                total_size += sfv[i].sfv_len;
                /*
                 * Individual elements of the vector must not
                 * wrap or overflow, as later math is signed.
                 * Equally total_size needs to be checked after
                 * each vector is added in, to be sure that
                 * rogue values haven't overflowed the counter.
                 */
                if (((ssize_t)sfv[i].sfv_len < 0) ||
                    (total_size < 0)) {
                    /*
                     * Truncate the vector to send data
                     * described by elements before the
                     * error.
                     */
                    copy_cnt = i;
                    first_vector_error = EINVAL;
                    /* total_size can't be trusted */
                    if (total_size < 0)
                        error = EINVAL;
                    break;
                }
            }
            /* Nothing to do, process errors */
            if (copy_cnt == 0)
                break;
#ifdef _SYSCALL32_IMPL
        }
#endif

        /*
         * The choice between sendvec_small_chunk and sendvec_chunk
         * depends on multiple things:
         *
         * i) latency is important for smaller files. So if the
         * data is smaller than 'tcp_slow_start_initial' times
         * maxblk, then use sendvec_small_chunk which creates
         * maxblk size mblks and chains them together and sends
         * them to TCP in one shot. It also leaves 'wroff' size
         * space for the headers in each mblk.
         *
         * ii) for total size bigger than 'tcp_slow_start_initial'
         * times maxblk, it's probably real file data which is
         * dominating. So it's better to use sendvec_chunk because
         * performance suffers badly if we don't do pagesize reads.
         * sendvec_chunk will do pagesize reads and write them
         * in pagesize mblks to TCP.
         *
         * Side Notes: A write to file has not been optimized.
         * Future zero copy code will plug into sendvec_chunk
         * only because doing zero copy for files smaller than
         * pagesize is useless.
         *
         * Note, if the socket has NL7C enabled then call NL7C's
         * sendfilev() function to consume the sfv[].
         */
        if (is_sock) {
            if (!SOCK_IS_NONSTR(so) &&
                _SOTOTPI(so)->sti_nl7c_flags != 0) {
                error = nl7c_sendfilev(so, &fileoff,
                    sfv, copy_cnt, &count);
            } else if ((total_size <= (4 * maxblk)) &&
                error == 0) {
                error = sendvec_small_chunk(fp,
                    &fileoff, sfv, copy_cnt,
                    total_size, maxblk, &count);
            } else {
                error = sendvec_chunk(fp, &fileoff,
                    sfv, copy_cnt, &count);
            }
        } else {
            ASSERT(vp->v_type == VREG);
            error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
                &count);
        }

#ifdef _SYSCALL32_IMPL
        if (get_udatamodel() == DATAMODEL_ILP32) {
            copy_vec = (const struct sendfilevec *)
                ((char *)copy_vec +
                (copy_cnt * sizeof (ksendfilevec32_t)));
        } else
#endif
            copy_vec += copy_cnt;
        sfvcnt -= copy_cnt;

    /* Process all vector members up to first error */
    } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

    if (vp->v_type == VREG)
        fp->f_offset += count;

    VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
    if (get_udatamodel() == DATAMODEL_ILP32) {
        ssize32_t count32 = (ssize32_t)count;
        if (copyout(&count32, xferred, sizeof (count32)))
            error = EFAULT;
        releasef(fildes);
        if (error != 0)
            return (set_errno(error));
        if (first_vector_error != 0)
            return (set_errno(first_vector_error));
        return (count32);
    }
#endif
    if (copyout(&count, xferred, sizeof (count)))
        error = EFAULT;
    releasef(fildes);
    if (error != 0)
        return (set_errno(error));
    if (first_vector_error != 0)
        return (set_errno(first_vector_error));
    return (count);
err:
    ASSERT(error != 0);
    releasef(fildes);
    return (set_errno(error));
}