/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
    "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

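/*
 * Example (illustrative, not part of this file): the limit exported by the
 * sysctl above can be read from userland either with sysctl(8)
 * ("sysctl kern.iov_max") or, portably, via sysconf(3):
 *
 *	long iov_max = sysconf(_SC_IOV_MAX);
 */
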
#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * Identify the physical page mapped at the given kernel virtual
 * address.  Insert this physical page into the given address space at
 * the given virtual address, replacing the physical page, if any,
 * that already exists there.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	KASSERT((uaddr & PAGE_MASK) == 0,
	    ("vm_pgmoveco: uaddr is not page aligned"));

	/*
	 * Herein the physical page is validated and dirtied.  It is
	 * unwired in sf_buf_mext().
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	kern_pg->valid = VM_PAGE_BITS_ALL;
	KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
	    ("vm_pgmoveco: kern_pg is not correctly wired"));

	if ((vm_map_lookup(&map, uaddr,
	    VM_PROT_WRITE, &entry, &uobject,
	    &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return (EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
retry:
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
			goto retry;
		vm_page_lock_queues();
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else {
		/*
		 * Even if a physical page does not exist in the
		 * object chain's first object, a physical page from a
		 * backing object may be mapped read only.
		 */
		if (uobject->backing_object != NULL)
			pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
		vm_page_lock_queues();
	}
	vm_page_insert(kern_pg, uobject, upindex);
	vm_page_dirty(kern_pg);
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return (KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling uiomove()");

	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}
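
/*
 * Usage sketch (illustrative only; "mydev_buf" and "mydev_len" are
 * hypothetical names): a typical character-device read routine hands its
 * kernel buffer to uiomove() and lets the uio bookkeeping track how much
 * data was transferred and where the next transfer should resume:
 *
 *	static int
 *	mydev_read(struct cdev *dev, struct uio *uio, int ioflag)
 *	{
 *		size_t amt;
 *
 *		if (uio->uio_offset >= mydev_len)
 *			return (0);
 *		amt = MIN(uio->uio_resid, mydev_len - uio->uio_offset);
 *		return (uiomove(mydev_buf + uio->uio_offset, amt, uio));
 *	}
 */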

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost certainly a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}
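
/*
 * Usage sketch (illustrative): when the entire object already lives in a
 * fixed kernel buffer, uiomove_frombuf() performs the offset and length
 * clamping that the uiomove() example above does by hand:
 *
 *	error = uiomove_frombuf(status_buf, status_len, uio);
 *
 * where "status_buf"/"status_len" are hypothetical names for a complete,
 * known-good buffer.
 */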

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling ureadc()");

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}
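
/*
 * Usage sketch (illustrative): a line-discipline or pseudo-device read
 * routine can return buffered input one character at a time until the
 * request is satisfied.  "next_char()" is a hypothetical producer that
 * returns -1 when no more input is available:
 *
 *	while (uio->uio_resid > 0 && (c = next_char()) != -1)
 *		if ((error = ureadc(c, uio)) != 0)
 *			break;
 */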

/*
 * General routine to allocate a hash table with control of memory flags.
 */
void *
hashinit_flags(int elements, struct malloc_type *type, u_long *hashmask,
    int flags)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");

	/* Exactly one of HASH_WAITOK and HASH_NOWAIT must be set. */
	KASSERT((flags & HASH_WAITOK) ^ (flags & HASH_NOWAIT),
	    ("Bad flags (0x%x) passed to hashinit_flags", flags));

	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;

	if (flags & HASH_NOWAIT)
		hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
		    type, M_NOWAIT);
	else
		hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
		    type, M_WAITOK);

	if (hashtbl != NULL) {
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&hashtbl[i]);
		*hashmask = hashsize - 1;
	}
	return (hashtbl);
}

/*
 * Allocate and initialize a hash table with default flag: may sleep.
 */
void *
hashinit(int elements, struct malloc_type *type, u_long *hashmask)
{

	return (hashinit_flags(elements, type, hashmask, HASH_WAITOK));
}
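
/*
 * Usage sketch (illustrative; "foo_hashtbl", "foo_hashmask", and M_FOO are
 * hypothetical names): hashinit() returns a power-of-two array of list
 * heads plus the matching mask, so lookups index the table by masking the
 * hash value:
 *
 *	foo_hashtbl = hashinit(desired_elements, M_FOO, &foo_hashmask);
 *	head = &foo_hashtbl[key_hash & foo_hashmask];
 *	LIST_INSERT_HEAD(head, elem, link);
 */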

void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
	LIST_HEAD(generic, generic) *hashtbl, *hp;

	hashtbl = vhashtbl;
	for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
		if (!LIST_EMPTY(hp))
			panic("hashdestroy: hash not empty");
	free(hashtbl, type);
}
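
/*
 * Note (illustrative): hashdestroy() panics if any bucket is still
 * populated, so a caller is expected to drain every chain first, e.g.:
 *
 *	for (i = 0; i <= foo_hashmask; i++)
 *		while ((elem = LIST_FIRST(&foo_hashtbl[i])) != NULL)
 *			LIST_REMOVE(elem, link);
 *	hashdestroy(foo_hashtbl, M_FOO, foo_hashmask);
 */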

static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define	NPRIMES (sizeof(primes) / sizeof(primes[0]))

/*
 * General routine to allocate a prime number sized hash table.
 */
void *
phashinit(int elements, struct malloc_type *type, u_long *nentries)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	hashsize = primes[i - 1];
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*nentries = hashsize;
	return (hashtbl);
}
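
/*
 * Note (illustrative): unlike hashinit(), phashinit() returns a prime
 * table size in *nentries rather than a mask, so lookups use a modulus
 * instead of masking:
 *
 *	head = &foo_phashtbl[key_hash % foo_nentries];
 */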

void
uio_yield(void)
{
	struct thread *td;

	td = curthread;
	DROP_GIANT();
	thread_lock(td);
	sched_prio(td, td->td_user_pri);
	mi_switch(SW_INVOL | SWT_RELINQUISH, NULL);
	thread_unlock(td);
	PICKUP_GIANT();
}

int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
		free(*iov, M_IOV);
		*iov = NULL;
	}
	return (error);
}

int
copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
	struct iovec *iov;
	struct uio *uio;
	u_int iovlen;
	int error, i;

	*uiop = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (EINVAL);
	iovlen = iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	iov = (struct iovec *)(uio + 1);
	error = copyin(iovp, iov, iovlen);
	if (error) {
		free(uio, M_IOV);
		return (error);
	}
	uio->uio_iov = iov;
	uio->uio_iovcnt = iovcnt;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_offset = -1;
	uio->uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		if (iov->iov_len > INT_MAX - uio->uio_resid) {
			free(uio, M_IOV);
			return (EINVAL);
		}
		uio->uio_resid += iov->iov_len;
		iov++;
	}
	*uiop = uio;
	return (0);
}
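
/*
 * Usage sketch (illustrative; "uap" and "auio" are hypothetical names): a
 * readv()/writev()-style system call built on this helper copies the user
 * iovec array into a single malloc'ed uio, performs the I/O, and releases
 * the whole allocation with one free():
 *
 *	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 *	if (error == 0) {
 *		... perform the I/O using auio ...
 *		free(auio, M_IOV);
 *	}
 */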

struct uio *
cloneuio(struct uio *uiop)
{
	struct uio *uio;
	int iovlen;

	iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	*uio = *uiop;
	uio->uio_iov = (struct iovec *)(uio + 1);
	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
	return (uio);
}
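
/*
 * Note (illustrative): cloneuio() allocates the copy and its iovec array
 * as one M_IOV allocation, so the clone is released with a single free()
 * when the caller is done with it:
 *
 *	clone = cloneuio(uio);
 *	... use clone ...
 *	free(clone, M_IOV);
 */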