/*	$KAME: altq_subr.c,v 1.23 2004/04/20 16:10:06 itojun Exp $	*/

/*
 * Copyright (C) 1997-2003
 *	Sony Computer Science Laboratories Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "opt_altq.h"
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/queue.h>
#include <sys/thread2.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/ifq_var.h>
#include <net/netmsg2.h>
#include <net/netisr2.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/udp.h>

#include <net/pf/pfvar.h>
#include <net/altq/altq.h>

/* machine dependent clock related includes */
#include <machine/clock.h>		/* for tsc_frequency */
#include <machine/md_var.h>		/* for cpu_feature */
#include <machine/specialreg.h>		/* for CPUID_TSC */
/*
 * internal function prototypes
 */
static void	tbr_timeout(void *);
static void	tbr_timeout_dispatch(netmsg_t);
static int	altq_enable_locked(struct ifaltq *);
static int	altq_disable_locked(struct ifaltq *);
static int	altq_detach_locked(struct ifaltq *);
static int	tbr_set_locked(struct ifaltq *, struct tb_profile *);

int (*altq_input)(struct mbuf *, int) = NULL;
static int	tbr_timer = 0;		/* token bucket regulator timer */
static struct	callout tbr_callout;
static struct	netmsg_base tbr_timeout_netmsg;

int	pfaltq_running;		/* keep track of running state */

MALLOC_DEFINE(M_ALTQ, "altq", "ALTQ structures");
/*
 * alternate queueing support routines
 */

/* look up the queue state by the interface name and the queueing type. */
void *
altq_lookup(const char *name, int type)
{
        struct ifnet *ifp;

        if ((ifp = ifunit(name)) != NULL) {
                if (type != ALTQT_NONE && ifp->if_snd.altq_type == type)
                        return (ifp->if_snd.altq_disc);
        }
        return (NULL);
}
int
altq_attach(struct ifaltq *ifq, int type, void *discipline,
    altq_mapsubq_t mapsubq,
    ifsq_enqueue_t enqueue, ifsq_dequeue_t dequeue, ifsq_request_t request,
    void *clfier,
    void *(*classify)(struct ifaltq *, struct mbuf *, struct altq_pktattr *))
{
        if (!ifq_is_ready(ifq))
                return (ENXIO);

        ifq->altq_type = type;
        ifq->altq_disc = discipline;
        ifq->altq_clfier = clfier;
        ifq->altq_classify = classify;
        ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED);
        ifq_set_methods(ifq, mapsubq, enqueue, dequeue, request);
        return (0);
}
static int
altq_detach_locked(struct ifaltq *ifq)
{
        if (!ifq_is_ready(ifq))
                return (ENXIO);
        if (ifq_is_enabled(ifq))
                return (EBUSY);
        if (!ifq_is_attached(ifq))
                return (0);

        ifq_set_classic(ifq);
        ifq->altq_type = ALTQT_NONE;
        ifq->altq_disc = NULL;
        ifq->altq_clfier = NULL;
        ifq->altq_classify = NULL;
        ifq->altq_flags &= ALTQF_CANTCHANGE;
        return (0);
}
int
altq_detach(struct ifaltq *ifq)
{
        int error;

        ifq_lock_all(ifq);
        error = altq_detach_locked(ifq);
        ifq_unlock_all(ifq);
        return (error);
}
static int
altq_enable_locked(struct ifaltq *ifq)
{
        if (!ifq_is_ready(ifq))
                return (ENXIO);
        if (ifq_is_enabled(ifq))
                return (0);

        ifq_purge_all_locked(ifq);

        ifq->altq_flags |= ALTQF_ENABLED;
        if (ifq->altq_clfier != NULL)
                ifq->altq_flags |= ALTQF_CLASSIFY;
        return (0);
}
int
altq_enable(struct ifaltq *ifq)
{
        int error;

        ifq_lock_all(ifq);
        error = altq_enable_locked(ifq);
        ifq_unlock_all(ifq);
        return (error);
}
static int
altq_disable_locked(struct ifaltq *ifq)
{
        if (!ifq_is_enabled(ifq))
                return (0);

        ifq_purge_all_locked(ifq);
        ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY);
        return (0);
}
int
altq_disable(struct ifaltq *ifq)
{
        int error;

        ifq_lock_all(ifq);
        error = altq_disable_locked(ifq);
        ifq_unlock_all(ifq);
        return (error);
}
/*
 * internal representation of token bucket parameters
 *	rate:	byte_per_unittime << 32
 *		(((bits_per_sec) / 8) << 32) / machclk_freq
 *	depth:	byte << 32
 */
#define	TBR_SHIFT	32
#define	TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)
#define	TBR_UNSCALE(x)	((x) >> TBR_SHIFT)
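/*
 * Worked example (illustrative, not from the original source): for a
 * 10Mbps profile with machclk_freq = 1GHz,
 *	tbr_rate = TBR_SCALE(10000000 / 8) / 10^9
 *	         = (1250000 << 32) / 10^9 ~= 5368709
 * which encodes 0.00125 bytes of credit per machine clock tick; the
 * 32-bit left shift is what keeps this sub-byte per-tick fraction
 * representable in integer arithmetic.
 */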
struct mbuf *
tbr_dequeue(struct ifaltq_subque *ifsq, int op)
{
        struct ifaltq *ifq = ifsq->ifsq_altq;
        struct tb_regulator *tbr;
        struct mbuf *m;
        int64_t interval;
        uint64_t now;

        if (ifsq_get_index(ifsq) != ALTQ_SUBQ_INDEX_DEFAULT) {
                /*
                 * Race happened, the unrelated subqueue was
                 * picked during the packet scheduler transition.
                 */
                ifsq_classic_request(ifsq, ALTRQ_PURGE, NULL);
                return (NULL);
        }

        crit_enter();
        tbr = ifq->altq_tbr;
        if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) {
                /* if this is a remove after poll, bypass tbr check */
        } else {
                /* update token only when it is negative */
                if (tbr->tbr_token <= 0) {
                        now = read_machclk();
                        interval = now - tbr->tbr_last;
                        if (interval >= tbr->tbr_filluptime) {
                                tbr->tbr_token = tbr->tbr_depth;
                        } else {
                                tbr->tbr_token += interval * tbr->tbr_rate;
                                if (tbr->tbr_token > tbr->tbr_depth)
                                        tbr->tbr_token = tbr->tbr_depth;
                        }
                        tbr->tbr_last = now;
                }
                /* if token is still negative, don't allow dequeue */
                if (tbr->tbr_token <= 0) {
                        crit_exit();
                        return (NULL);
                }
        }

        if (ifq_is_enabled(ifq))
                m = (*ifsq->ifsq_dequeue)(ifsq, op);
        else
                m = ifsq_classic_dequeue(ifsq, op);

        if (m != NULL && op == ALTDQ_REMOVE)
                tbr->tbr_token -= TBR_SCALE(m_pktlen(m));
        tbr->tbr_lastop = op;
        crit_exit();
        return (m);
}
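/*
 * Illustrative numbers for the debit path above (continuing the 10Mbps /
 * 1GHz example, not from the original source): dequeueing a 1500-byte
 * packet subtracts TBR_SCALE(1500) = 1500 << 32 from tbr_token, and at
 * 0.00125 bytes of credit per tick the bucket needs 1500 / 0.00125 =
 * 1.2 million ticks (1.2ms at 1GHz) to earn that back, which matches
 * 1500 * 8 bits / 10Mbps.
 */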
/*
 * set a token bucket regulator.
 * if the specified rate is zero, the token bucket regulator is deleted.
 */
static int
tbr_set_locked(struct ifaltq *ifq, struct tb_profile *profile)
{
        struct tb_regulator *tbr, *otbr;

        if (machclk_freq == 0)
                init_machclk();
        if (machclk_freq == 0) {
                kprintf("%s: no cpu clock available!\n", __func__);
                return (ENXIO);
        }

        if (profile->rate == 0) {
                /* delete this tbr */
                if ((tbr = ifq->altq_tbr) == NULL)
                        return (ENOENT);
                ifq->altq_tbr = NULL;
                kfree(tbr, M_ALTQ);
                return (0);
        }

        tbr = kmalloc(sizeof(*tbr), M_ALTQ, M_WAITOK | M_ZERO);
        tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq;
        tbr->tbr_depth = TBR_SCALE(profile->depth);
        if (tbr->tbr_rate > 0)
                tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
        else
                tbr->tbr_filluptime = 0xffffffffffffffffLL;
        tbr->tbr_token = tbr->tbr_depth;
        tbr->tbr_last = read_machclk();
        tbr->tbr_lastop = ALTDQ_REMOVE;

        otbr = ifq->altq_tbr;
        ifq->altq_tbr = tbr;	/* set the new tbr */

        if (otbr != NULL)
                kfree(otbr, M_ALTQ);
        else if (tbr_timer == 0) {
                callout_reset_bycpu(&tbr_callout, 1, tbr_timeout, NULL, 0);
                tbr_timer = 1;
        }
        return (0);
}
int
tbr_set(struct ifaltq *ifq, struct tb_profile *profile)
{
        int error;

        ifq_lock_all(ifq);
        error = tbr_set_locked(ifq, profile);
        ifq_unlock_all(ifq);
        return (error);
}
static void
tbr_timeout(void *arg __unused)
{
        struct lwkt_msg *lmsg = &tbr_timeout_netmsg.lmsg;

        KASSERT(mycpuid == 0, ("not on cpu0"));
        crit_enter();
        if (lmsg->ms_flags & MSGF_DONE)
                lwkt_sendmsg_oncpu(netisr_cpuport(0), lmsg);
        crit_exit();
}
/*
 * tbr_timeout goes through the interface list, and kicks the drivers
 * which need tbr.
 */
static void
tbr_timeout_dispatch(netmsg_t nmsg)
{
        const struct ifnet_array *arr;
        int active, i;

        crit_enter();
        lwkt_replymsg(&nmsg->lmsg, 0);	/* reply ASAP */
        crit_exit();

        active = 0;
        arr = ifnet_array_get();
        for (i = 0; i < arr->ifnet_count; ++i) {
                struct ifnet *ifp = arr->ifnet_arr[i];
                struct ifaltq_subque *ifsq;

                if (ifp->if_snd.altq_tbr == NULL)
                        continue;

                ifsq = &ifp->if_snd.altq_subq[ALTQ_SUBQ_INDEX_DEFAULT];
                active++;
                if (!ifsq_is_empty(ifsq) && ifp->if_start != NULL) {
                        ifsq_serialize_hw(ifsq);
                        (*ifp->if_start)(ifp, ifsq);
                        ifsq_deserialize_hw(ifsq);
                }
        }

        if (active)
                callout_reset(&tbr_callout, 1, tbr_timeout, NULL);
        else
                tbr_timer = 0;	/* don't need tbr_timer anymore */
}
/*
 * get token bucket regulator profile
 */
int
tbr_get(struct ifaltq *ifq, struct tb_profile *profile)
{
        struct tb_regulator *tbr;

        if ((tbr = ifq->altq_tbr) == NULL) {
                profile->rate = 0;
                profile->depth = 0;
        } else {
                profile->rate =
                    (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq);
                profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth);
        }
        return (0);
}
/*
 * attach a discipline to the interface.  if one already exists, it is
 * overridden.
 */
int
altq_pfattach(struct pf_altq *a)
{
        struct ifaltq *ifq;
        struct ifnet *ifp;
        int error;

        if (a->scheduler == ALTQT_NONE)
                return (0);

        if (a->altq_disc == NULL)
                return (EINVAL);

        ifnet_lock();

        ifp = ifunit(a->ifname);
        if (ifp == NULL) {
                ifnet_unlock();
                return (EINVAL);
        }
        ifq = &ifp->if_snd;

        ifq_lock_all(ifq);

        switch (a->scheduler) {
#ifdef ALTQ_CBQ
        case ALTQT_CBQ:
                error = cbq_pfattach(a, ifq);
                break;
#endif
#ifdef ALTQ_PRIQ
        case ALTQT_PRIQ:
                error = priq_pfattach(a, ifq);
                break;
#endif
#ifdef ALTQ_HFSC
        case ALTQT_HFSC:
                error = hfsc_pfattach(a, ifq);
                break;
#endif
#ifdef ALTQ_FAIRQ
        case ALTQT_FAIRQ:
                error = fairq_pfattach(a, ifq);
                break;
#endif
        default:
                error = ENXIO;
                goto back;
        }

        /* if the state is running, enable altq */
        if (error == 0 && pfaltq_running && ifq->altq_type != ALTQT_NONE &&
            !ifq_is_enabled(ifq))
                error = altq_enable_locked(ifq);

        /* if altq is already enabled, reset the tokenbucket regulator */
        if (error == 0 && ifq_is_enabled(ifq)) {
                struct tb_profile tb;

                tb.rate = a->ifbandwidth;
                tb.depth = a->tbrsize;
                error = tbr_set_locked(ifq, &tb);
        }
back:
        ifq_unlock_all(ifq);
        ifnet_unlock();
        return (error);
}
/*
 * detach a discipline from the interface.
 * it is possible that the discipline was already overridden by another
 * discipline.
 */
int
altq_pfdetach(struct pf_altq *a)
{
        struct ifnet *ifp;
        struct ifaltq *ifq;
        int error = 0;

        ifnet_lock();

        ifp = ifunit(a->ifname);
        if (ifp == NULL) {
                ifnet_unlock();
                return (EINVAL);
        }
        ifq = &ifp->if_snd;

        /* if this discipline is no longer referenced, just return */
        if (a->altq_disc == NULL) {
                ifnet_unlock();
                return (0);
        }

        ifq_lock_all(ifq);

        if (a->altq_disc != ifq->altq_disc)
                goto back;

        if (ifq_is_enabled(ifq))
                error = altq_disable_locked(ifq);
        if (error == 0)
                error = altq_detach_locked(ifq);

back:
        ifq_unlock_all(ifq);
        ifnet_unlock();
        return (error);
}
/*
 * add a discipline or a queue
 */
int
altq_add(struct pf_altq *a)
{
        int error = 0;

        if (a->qname[0] != 0)
                return (altq_add_queue(a));

        if (machclk_freq == 0)
                init_machclk();
        if (machclk_freq == 0)
                panic("altq_add: no cpu clock");

        switch (a->scheduler) {
#ifdef ALTQ_CBQ
        case ALTQT_CBQ:
                error = cbq_add_altq(a);
                break;
#endif
#ifdef ALTQ_PRIQ
        case ALTQT_PRIQ:
                error = priq_add_altq(a);
                break;
#endif
#ifdef ALTQ_HFSC
        case ALTQT_HFSC:
                error = hfsc_add_altq(a);
                break;
#endif
#ifdef ALTQ_FAIRQ
        case ALTQT_FAIRQ:
                error = fairq_add_altq(a);
                break;
#endif
        default:
                error = ENXIO;
        }

        return (error);
}
/*
 * remove a discipline or a queue
 */
int
altq_remove(struct pf_altq *a)
{
        int error = 0;

        if (a->qname[0] != 0)
                return (altq_remove_queue(a));

        switch (a->scheduler) {
#ifdef ALTQ_CBQ
        case ALTQT_CBQ:
                error = cbq_remove_altq(a);
                break;
#endif
#ifdef ALTQ_PRIQ
        case ALTQT_PRIQ:
                error = priq_remove_altq(a);
                break;
#endif
#ifdef ALTQ_HFSC
        case ALTQT_HFSC:
                error = hfsc_remove_altq(a);
                break;
#endif
#ifdef ALTQ_FAIRQ
        case ALTQT_FAIRQ:
                error = fairq_remove_altq(a);
                break;
#endif
        default:
                error = ENXIO;
        }

        return (error);
}
/*
 * add a queue to the discipline
 */
int
altq_add_queue(struct pf_altq *a)
{
        int error = 0;

        switch (a->scheduler) {
#ifdef ALTQ_CBQ
        case ALTQT_CBQ:
                error = cbq_add_queue(a);
                break;
#endif
#ifdef ALTQ_PRIQ
        case ALTQT_PRIQ:
                error = priq_add_queue(a);
                break;
#endif
#ifdef ALTQ_HFSC
        case ALTQT_HFSC:
                error = hfsc_add_queue(a);
                break;
#endif
#ifdef ALTQ_FAIRQ
        case ALTQT_FAIRQ:
                error = fairq_add_queue(a);
                break;
#endif
        default:
                error = ENXIO;
        }

        return (error);
}
/*
 * remove a queue from the discipline
 */
int
altq_remove_queue(struct pf_altq *a)
{
        int error = 0;

        switch (a->scheduler) {
#ifdef ALTQ_CBQ
        case ALTQT_CBQ:
                error = cbq_remove_queue(a);
                break;
#endif
#ifdef ALTQ_PRIQ
        case ALTQT_PRIQ:
                error = priq_remove_queue(a);
                break;
#endif
#ifdef ALTQ_HFSC
        case ALTQT_HFSC:
                error = hfsc_remove_queue(a);
                break;
#endif
#ifdef ALTQ_FAIRQ
        case ALTQT_FAIRQ:
                error = fairq_remove_queue(a);
                break;
#endif
        default:
                error = ENXIO;
        }

        return (error);
}
/*
 * get queue statistics
 */
int
altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
{
        int error = 0;

        switch (a->scheduler) {
#ifdef ALTQ_CBQ
        case ALTQT_CBQ:
                error = cbq_getqstats(a, ubuf, nbytes);
                break;
#endif
#ifdef ALTQ_PRIQ
        case ALTQT_PRIQ:
                error = priq_getqstats(a, ubuf, nbytes);
                break;
#endif
#ifdef ALTQ_HFSC
        case ALTQT_HFSC:
                error = hfsc_getqstats(a, ubuf, nbytes);
                break;
#endif
#ifdef ALTQ_FAIRQ
        case ALTQT_FAIRQ:
                error = fairq_getqstats(a, ubuf, nbytes);
                break;
#endif
        default:
                error = ENXIO;
        }

        return (error);
}
/*
 * read and write diffserv field in IPv4 or IPv6 header
 */
uint8_t
read_dsfield(struct mbuf *m, struct altq_pktattr *pktattr)
{
        struct mbuf *m0;
        uint8_t ds_field = 0;

        if (pktattr == NULL ||
            (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
                return ((uint8_t)0);

        /* verify that pattr_hdr is within the mbuf data */
        for (m0 = m; m0 != NULL; m0 = m0->m_next) {
                if ((pktattr->pattr_hdr >= m0->m_data) &&
                    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
                        break;
        }
        if (m0 == NULL) {
                /* ick, pattr_hdr is stale */
                pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
                kprintf("read_dsfield: can't locate header!\n");
#endif
                return ((uint8_t)0);
        }

        if (pktattr->pattr_af == AF_INET) {
                struct ip *ip = (struct ip *)pktattr->pattr_hdr;

                if (ip->ip_v != 4)
                        return ((uint8_t)0);	/* version mismatch! */
                ds_field = ip->ip_tos;
        }
#ifdef INET6
        else if (pktattr->pattr_af == AF_INET6) {
                struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
                uint32_t flowlabel;

                /*
                 * the DS field occupies bits 20-27 of the host-order flow
                 * word: 4 bits version, 8 bits traffic class, 20 bits
                 * flow label.
                 */
                flowlabel = ntohl(ip6->ip6_flow);
                if ((flowlabel >> 28) != 6)
                        return ((uint8_t)0);	/* version mismatch! */
                ds_field = (flowlabel >> 20) & 0xff;
        }
#endif
        return (ds_field);
}
void
write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, uint8_t dsfield)
{
        struct mbuf *m0;

        if (pktattr == NULL ||
            (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
                return;

        /* verify that pattr_hdr is within the mbuf data */
        for (m0 = m; m0 != NULL; m0 = m0->m_next) {
                if ((pktattr->pattr_hdr >= m0->m_data) &&
                    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
                        break;
        }
        if (m0 == NULL) {
                /* ick, pattr_hdr is stale */
                pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
                kprintf("write_dsfield: can't locate header!\n");
#endif
                return;
        }

        if (pktattr->pattr_af == AF_INET) {
                struct ip *ip = (struct ip *)pktattr->pattr_hdr;
                uint8_t old;
                int32_t sum;

                if (ip->ip_v != 4)
                        return;		/* version mismatch! */
                old = ip->ip_tos;
                dsfield |= old & 3;	/* leave CU bits */
                if (old == dsfield)
                        return;
                ip->ip_tos = dsfield;
                /*
                 * update checksum (from RFC1624)
                 *	   HC' = ~(~HC + ~m + m')
                 */
                sum = ~ntohs(ip->ip_sum) & 0xffff;
                sum += 0xff00 + (~old & 0xff) + dsfield;
                sum = (sum >> 16) + (sum & 0xffff);
                sum += (sum >> 16);	/* add carry */

                ip->ip_sum = htons(~sum & 0xffff);
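                /*
                 * Worked example (illustrative, not from the original
                 * source): with old = 0x00, dsfield = 0xb8 and
                 * ip_sum = 0x1234,
                 *	sum = 0xedcb + 0xff00 + 0xff + 0xb8 = 0x1ee82
                 * which folds to 0xee83, giving a new ip_sum of
                 * ~0xee83 = 0x117c.  The 0xff00 term stands in for the
                 * unchanged high byte of the 16-bit word, since
                 * ~x + x = 0xff for any byte x.
                 */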
        }
#ifdef INET6
        else if (pktattr->pattr_af == AF_INET6) {
                struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
                uint32_t flowlabel;

                flowlabel = ntohl(ip6->ip6_flow);
                if ((flowlabel >> 28) != 6)
                        return;		/* version mismatch! */
                flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
                ip6->ip6_flow = htonl(flowlabel);
                /* no need to update checksum */
        }
#endif
}
/*
 * high resolution clock support taking advantage of a machine dependent
 * high resolution time counter (e.g., timestamp counter of intel pentium).
 * we assume
 *  - 64-bit-long monotonically-increasing counter
 *  - frequency range is 100M-4GHz (CPU speed)
 */
/* if pcc is not available or disabled, emulate 256MHz using microtime() */
#define	MACHCLK_SHIFT	8
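/*
 * Illustrative note (not from the original source): the emulated clock
 * runs at 1000000 << MACHCLK_SHIFT = 256,000,000 Hz, i.e. each
 * microtime() microsecond is scaled up to 256 machine clock units.
 */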
static int machclk_usepcc;
uint64_t machclk_freq = 0;
uint32_t machclk_per_tick = 0;
void
init_machclk_setup(void)
{
        callout_init_mp(&tbr_callout);
        netmsg_init(&tbr_timeout_netmsg, NULL, &netisr_adone_rport,
            MSGF_PRIORITY, tbr_timeout_dispatch);

        machclk_usepcc = 1;

#if defined(__x86_64__)
        /* only use the TSC if it is actually present and usable */
        if ((cpu_feature & CPUID_TSC) == 0 || tsc_frequency == 0)
                machclk_usepcc = 0;
#else
        machclk_usepcc = 0;
#endif
}

void
init_machclk(void)
{
        static int called;

        /* call the one-time initialization function once */
        if (!called) {
                init_machclk_setup();
                called = 1;
        }

        if (!machclk_usepcc) {
                /* emulate 256MHz using microtime() */
                machclk_freq = 1000000LLU << MACHCLK_SHIFT;
                machclk_per_tick = machclk_freq / hz;
#ifdef ALTQ_DEBUG
                kprintf("altq: emulate %juHz cpu clock\n",
                    (uintmax_t)machclk_freq);
#endif
                return;
        }

        /*
         * if the clock frequency (of Pentium TSC) is accessible,
         * just use it.
         */
#ifdef _RDTSC_SUPPORTED_
        if (cpu_feature & CPUID_TSC)
                machclk_freq = tsc_frequency;
#endif
        /*
         * if we don't know the clock frequency, measure it.
         */
        if (machclk_freq == 0) {
                static int wait;
                struct timeval tv_start, tv_end;
                uint64_t start, end, diff;
                int timo;

                microtime(&tv_start);
                start = read_machclk();
                timo = hz;	/* 1 sec */
                tsleep(&wait, PCATCH, "init_machclk", timo);
                microtime(&tv_end);
                end = read_machclk();
                diff = (uint64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
                    + tv_end.tv_usec - tv_start.tv_usec;
                if (diff != 0)
                        machclk_freq = (end - start) * 1000000 / diff;
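                /*
                 * Worked example (illustrative, not from the original
                 * source): if the counter advanced by 2,400,000,000
                 * while microtime() measured diff = 1,000,000us, then
                 *	machclk_freq = 2400000000 * 1000000 / 1000000
                 *	             = 2.4GHz.
                 */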
        }

        machclk_per_tick = machclk_freq / hz;
#ifdef ALTQ_DEBUG
        kprintf("altq: CPU clock: %juHz\n", (uintmax_t)machclk_freq);
#endif
}

uint64_t
read_machclk(void)
{
        uint64_t val;

        if (machclk_usepcc) {
#ifdef _RDTSC_SUPPORTED_
                val = rdtsc();
#else
                panic("read_machclk");
#endif
        } else {
                struct timeval tv;

                microtime(&tv);
                val = (((uint64_t)tv.tv_sec * 1000000 + tv.tv_usec) <<
                    MACHCLK_SHIFT);
        }
        return (val);
}