2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
32 #include <linux/seq_file.h>
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36 #include <linux/mutex.h>
38 #include <net/net_namespace.h>
40 #include <net/route.h>
43 #include <asm/uaccess.h>
45 #include <net/ip_vs.h>
47 /* mutex serializing IPVS sockopts; [gs]etsockopt may sleep. */
48 static DEFINE_MUTEX(__ip_vs_mutex
);
50 /* lock for service table */
51 static DEFINE_RWLOCK(__ip_vs_svc_lock
);
53 /* lock for table with the real services */
54 static DEFINE_RWLOCK(__ip_vs_rs_lock
);
56 /* lock for state and timeout tables */
57 static DEFINE_RWLOCK(__ip_vs_securetcp_lock
);
59 /* lock for drop entry handling */
60 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock
);
62 /* lock for drop packet handling */
63 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock
);
65 /* 1/rate drop and drop-entry variables */
66 int ip_vs_drop_rate
= 0;
67 int ip_vs_drop_counter
= 0;
68 static atomic_t ip_vs_dropentry
= ATOMIC_INIT(0);
70 /* number of virtual services */
71 static int ip_vs_num_services
= 0;
73 /* sysctl variables */
74 static int sysctl_ip_vs_drop_entry
= 0;
75 static int sysctl_ip_vs_drop_packet
= 0;
76 static int sysctl_ip_vs_secure_tcp
= 0;
77 static int sysctl_ip_vs_amemthresh
= 1024;
78 static int sysctl_ip_vs_am_droprate
= 10;
79 int sysctl_ip_vs_cache_bypass
= 0;
80 int sysctl_ip_vs_expire_nodest_conn
= 0;
81 int sysctl_ip_vs_expire_quiescent_template
= 0;
82 int sysctl_ip_vs_sync_threshold
[2] = { 3, 50 };
83 int sysctl_ip_vs_nat_icmp_send
= 0;
86 #ifdef CONFIG_IP_VS_DEBUG
87 static int sysctl_ip_vs_debug_level
= 0;
89 int ip_vs_get_debug_level(void)
91 return sysctl_ip_vs_debug_level
;
96 * update_defense_level is called from keventd and from sysctl,
97 * so it needs to protect itself from softirqs
99 static void update_defense_level(void)
102 static int old_secure_tcp
= 0;
107 /* we only count free and buffered memory (in pages) */
109 availmem
= i
.freeram
+ i
.bufferram
;
110 /* however in linux 2.5 the i.bufferram is total page cache size,
112 /* si_swapinfo(&i); */
113 /* availmem = availmem - (i.totalswap - i.freeswap); */
115 nomem
= (availmem
< sysctl_ip_vs_amemthresh
);
120 spin_lock(&__ip_vs_dropentry_lock
);
121 switch (sysctl_ip_vs_drop_entry
) {
123 atomic_set(&ip_vs_dropentry
, 0);
127 atomic_set(&ip_vs_dropentry
, 1);
128 sysctl_ip_vs_drop_entry
= 2;
130 atomic_set(&ip_vs_dropentry
, 0);
135 atomic_set(&ip_vs_dropentry
, 1);
137 atomic_set(&ip_vs_dropentry
, 0);
138 sysctl_ip_vs_drop_entry
= 1;
142 atomic_set(&ip_vs_dropentry
, 1);
145 spin_unlock(&__ip_vs_dropentry_lock
);
148 spin_lock(&__ip_vs_droppacket_lock
);
149 switch (sysctl_ip_vs_drop_packet
) {
155 ip_vs_drop_rate
= ip_vs_drop_counter
156 = sysctl_ip_vs_amemthresh
/
157 (sysctl_ip_vs_amemthresh
-availmem
);
158 sysctl_ip_vs_drop_packet
= 2;
165 ip_vs_drop_rate
= ip_vs_drop_counter
166 = sysctl_ip_vs_amemthresh
/
167 (sysctl_ip_vs_amemthresh
-availmem
);
170 sysctl_ip_vs_drop_packet
= 1;
174 ip_vs_drop_rate
= sysctl_ip_vs_am_droprate
;
177 spin_unlock(&__ip_vs_droppacket_lock
);
180 write_lock(&__ip_vs_securetcp_lock
);
181 switch (sysctl_ip_vs_secure_tcp
) {
183 if (old_secure_tcp
>= 2)
188 if (old_secure_tcp
< 2)
190 sysctl_ip_vs_secure_tcp
= 2;
192 if (old_secure_tcp
>= 2)
198 if (old_secure_tcp
< 2)
201 if (old_secure_tcp
>= 2)
203 sysctl_ip_vs_secure_tcp
= 1;
207 if (old_secure_tcp
< 2)
211 old_secure_tcp
= sysctl_ip_vs_secure_tcp
;
213 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp
>1);
214 write_unlock(&__ip_vs_securetcp_lock
);
221 * Timer for checking the defense
/*
 * Delay handed to schedule_delayed_work() each time defense_work is
 * re-armed. The expansion is parenthesized so that it stays a single
 * operand when the macro is used inside a larger expression
 * (e.g. x / DEFENSE_TIMER_PERIOD).
 */
#define DEFENSE_TIMER_PERIOD	(1*HZ)
224 static void defense_work_handler(struct work_struct
*work
);
225 static DECLARE_DELAYED_WORK(defense_work
, defense_work_handler
);
227 static void defense_work_handler(struct work_struct
*work
)
229 update_defense_level();
230 if (atomic_read(&ip_vs_dropentry
))
231 ip_vs_random_dropentry();
233 schedule_delayed_work(&defense_work
, DEFENSE_TIMER_PERIOD
);
237 ip_vs_use_count_inc(void)
239 return try_module_get(THIS_MODULE
);
243 ip_vs_use_count_dec(void)
245 module_put(THIS_MODULE
);
250 * Hash table: for virtual service lookups
252 #define IP_VS_SVC_TAB_BITS 8
253 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
256 /* the service table hashed by <protocol, addr, port> */
257 static struct list_head ip_vs_svc_table
[IP_VS_SVC_TAB_SIZE
];
258 /* the service table hashed by fwmark */
259 static struct list_head ip_vs_svc_fwm_table
[IP_VS_SVC_TAB_SIZE
];
262 * Hash table: for real service lookups
264 #define IP_VS_RTAB_BITS 4
265 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
268 static struct list_head ip_vs_rtable
[IP_VS_RTAB_SIZE
];
271 * Trash for destinations
273 static LIST_HEAD(ip_vs_dest_trash
);
276 * FTP & NULL virtual service counters
278 static atomic_t ip_vs_ftpsvc_counter
= ATOMIC_INIT(0);
279 static atomic_t ip_vs_nullsvc_counter
= ATOMIC_INIT(0);
283 * Returns hash value for virtual service
285 static __inline__
unsigned
286 ip_vs_svc_hashkey(unsigned proto
, __be32 addr
, __be16 port
)
288 register unsigned porth
= ntohs(port
);
290 return (proto
^ntohl(addr
)^(porth
>>IP_VS_SVC_TAB_BITS
)^porth
)
291 & IP_VS_SVC_TAB_MASK
;
295 * Returns hash value of fwmark for virtual service lookup
297 static __inline__
unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark
)
299 return fwmark
& IP_VS_SVC_TAB_MASK
;
303 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304 * or in the ip_vs_svc_fwm_table by fwmark.
305 * Should be called with locked tables.
307 static int ip_vs_svc_hash(struct ip_vs_service
*svc
)
311 if (svc
->flags
& IP_VS_SVC_F_HASHED
) {
312 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313 "called from %p\n", __builtin_return_address(0));
317 if (svc
->fwmark
== 0) {
319 * Hash it by <protocol,addr,port> in ip_vs_svc_table
321 hash
= ip_vs_svc_hashkey(svc
->protocol
, svc
->addr
, svc
->port
);
322 list_add(&svc
->s_list
, &ip_vs_svc_table
[hash
]);
325 * Hash it by fwmark in ip_vs_svc_fwm_table
327 hash
= ip_vs_svc_fwm_hashkey(svc
->fwmark
);
328 list_add(&svc
->f_list
, &ip_vs_svc_fwm_table
[hash
]);
331 svc
->flags
|= IP_VS_SVC_F_HASHED
;
332 /* increase its refcnt because it is referenced by the svc table */
333 atomic_inc(&svc
->refcnt
);
339 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340 * Should be called with locked tables.
342 static int ip_vs_svc_unhash(struct ip_vs_service
*svc
)
344 if (!(svc
->flags
& IP_VS_SVC_F_HASHED
)) {
345 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346 "called from %p\n", __builtin_return_address(0));
350 if (svc
->fwmark
== 0) {
351 /* Remove it from the ip_vs_svc_table table */
352 list_del(&svc
->s_list
);
354 /* Remove it from the ip_vs_svc_fwm_table table */
355 list_del(&svc
->f_list
);
358 svc
->flags
&= ~IP_VS_SVC_F_HASHED
;
359 atomic_dec(&svc
->refcnt
);
365 * Get service by {proto,addr,port} in the service table.
367 static __inline__
struct ip_vs_service
*
368 __ip_vs_service_get(__u16 protocol
, __be32 vaddr
, __be16 vport
)
371 struct ip_vs_service
*svc
;
373 /* Check for "full" addressed entries */
374 hash
= ip_vs_svc_hashkey(protocol
, vaddr
, vport
);
376 list_for_each_entry(svc
, &ip_vs_svc_table
[hash
], s_list
){
377 if ((svc
->addr
== vaddr
)
378 && (svc
->port
== vport
)
379 && (svc
->protocol
== protocol
)) {
381 atomic_inc(&svc
->usecnt
);
391 * Get service by {fwmark} in the service table.
393 static __inline__
struct ip_vs_service
*__ip_vs_svc_fwm_get(__u32 fwmark
)
396 struct ip_vs_service
*svc
;
398 /* Check for fwmark addressed entries */
399 hash
= ip_vs_svc_fwm_hashkey(fwmark
);
401 list_for_each_entry(svc
, &ip_vs_svc_fwm_table
[hash
], f_list
) {
402 if (svc
->fwmark
== fwmark
) {
404 atomic_inc(&svc
->usecnt
);
412 struct ip_vs_service
*
413 ip_vs_service_get(__u32 fwmark
, __u16 protocol
, __be32 vaddr
, __be16 vport
)
415 struct ip_vs_service
*svc
;
417 read_lock(&__ip_vs_svc_lock
);
420 * Check the table hashed by fwmark first
422 if (fwmark
&& (svc
= __ip_vs_svc_fwm_get(fwmark
)))
426 * Check the table hashed by <protocol,addr,port>
427 * for "full" addressed entries
429 svc
= __ip_vs_service_get(protocol
, vaddr
, vport
);
432 && protocol
== IPPROTO_TCP
433 && atomic_read(&ip_vs_ftpsvc_counter
)
434 && (vport
== FTPDATA
|| ntohs(vport
) >= PROT_SOCK
)) {
436 * Check if ftp service entry exists, the packet
437 * might belong to FTP data connections.
439 svc
= __ip_vs_service_get(protocol
, vaddr
, FTPPORT
);
443 && atomic_read(&ip_vs_nullsvc_counter
)) {
445 * Check if the catch-all port (port zero) exists
447 svc
= __ip_vs_service_get(protocol
, vaddr
, 0);
451 read_unlock(&__ip_vs_svc_lock
);
453 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454 fwmark
, ip_vs_proto_name(protocol
),
455 NIPQUAD(vaddr
), ntohs(vport
),
456 svc
?"hit":"not hit");
463 __ip_vs_bind_svc(struct ip_vs_dest
*dest
, struct ip_vs_service
*svc
)
465 atomic_inc(&svc
->refcnt
);
470 __ip_vs_unbind_svc(struct ip_vs_dest
*dest
)
472 struct ip_vs_service
*svc
= dest
->svc
;
475 if (atomic_dec_and_test(&svc
->refcnt
))
481 * Returns hash value for real service
483 static __inline__
unsigned ip_vs_rs_hashkey(__be32 addr
, __be16 port
)
485 register unsigned porth
= ntohs(port
);
487 return (ntohl(addr
)^(porth
>>IP_VS_RTAB_BITS
)^porth
)
492 * Hashes ip_vs_dest in ip_vs_rtable by <addr,port>.
493 * should be called with locked tables.
495 static int ip_vs_rs_hash(struct ip_vs_dest
*dest
)
499 if (!list_empty(&dest
->d_list
)) {
504 * Hash by proto,addr,port,
505 * which are the parameters of the real service.
507 hash
= ip_vs_rs_hashkey(dest
->addr
, dest
->port
);
508 list_add(&dest
->d_list
, &ip_vs_rtable
[hash
]);
514 * UNhashes ip_vs_dest from ip_vs_rtable.
515 * should be called with locked tables.
517 static int ip_vs_rs_unhash(struct ip_vs_dest
*dest
)
520 * Remove it from the ip_vs_rtable table.
522 if (!list_empty(&dest
->d_list
)) {
523 list_del(&dest
->d_list
);
524 INIT_LIST_HEAD(&dest
->d_list
);
531 * Lookup real service by <proto,addr,port> in the real service table.
534 ip_vs_lookup_real_service(__u16 protocol
, __be32 daddr
, __be16 dport
)
537 struct ip_vs_dest
*dest
;
540 * Check for "full" addressed entries
541 * Return the first found entry
543 hash
= ip_vs_rs_hashkey(daddr
, dport
);
545 read_lock(&__ip_vs_rs_lock
);
546 list_for_each_entry(dest
, &ip_vs_rtable
[hash
], d_list
) {
547 if ((dest
->addr
== daddr
)
548 && (dest
->port
== dport
)
549 && ((dest
->protocol
== protocol
) ||
552 read_unlock(&__ip_vs_rs_lock
);
556 read_unlock(&__ip_vs_rs_lock
);
562 * Lookup destination by {addr,port} in the given service
564 static struct ip_vs_dest
*
565 ip_vs_lookup_dest(struct ip_vs_service
*svc
, __be32 daddr
, __be16 dport
)
567 struct ip_vs_dest
*dest
;
570 * Find the destination for the given service
572 list_for_each_entry(dest
, &svc
->destinations
, n_list
) {
573 if ((dest
->addr
== daddr
) && (dest
->port
== dport
)) {
583 * Find destination by {daddr,dport,vaddr,protocol}
584 * Created to be used in ip_vs_process_message() in
585 * the backup synchronization daemon. It finds the
586 * destination to be bound to the received connection
589 * ip_vs_lookup_real_service() looked promising, but
590 * does not seem to work as expected.
592 struct ip_vs_dest
*ip_vs_find_dest(__be32 daddr
, __be16 dport
,
593 __be32 vaddr
, __be16 vport
, __u16 protocol
)
595 struct ip_vs_dest
*dest
;
596 struct ip_vs_service
*svc
;
598 svc
= ip_vs_service_get(0, protocol
, vaddr
, vport
);
601 dest
= ip_vs_lookup_dest(svc
, daddr
, dport
);
603 atomic_inc(&dest
->refcnt
);
604 ip_vs_service_put(svc
);
607 EXPORT_SYMBOL(ip_vs_find_dest
);
610 * Lookup dest by {svc,addr,port} in the destination trash.
611 * The destination trash is used to hold the destinations that are removed
612 * from the service table but are still referenced by some conn entries.
613 * The reason to add the destination trash is when the dest is temporarily
614 * down (either by administrator or by monitor program), the dest can be
615 * picked back from the trash, the remaining connections to the dest can
616 * continue, and the counting information of the dest is also useful for
619 static struct ip_vs_dest
*
620 ip_vs_trash_get_dest(struct ip_vs_service
*svc
, __be32 daddr
, __be16 dport
)
622 struct ip_vs_dest
*dest
, *nxt
;
625 * Find the destination in trash
627 list_for_each_entry_safe(dest
, nxt
, &ip_vs_dest_trash
, n_list
) {
628 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
631 NIPQUAD(dest
->addr
), ntohs(dest
->port
),
632 atomic_read(&dest
->refcnt
));
633 if (dest
->addr
== daddr
&&
634 dest
->port
== dport
&&
635 dest
->vfwmark
== svc
->fwmark
&&
636 dest
->protocol
== svc
->protocol
&&
638 (dest
->vaddr
== svc
->addr
&&
639 dest
->vport
== svc
->port
))) {
645 * Try to purge the destination from trash if not referenced
647 if (atomic_read(&dest
->refcnt
) == 1) {
648 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
651 NIPQUAD(dest
->addr
), ntohs(dest
->port
));
652 list_del(&dest
->n_list
);
653 ip_vs_dst_reset(dest
);
654 __ip_vs_unbind_svc(dest
);
664 * Clean up all the destinations in the trash
665 * Called by the ip_vs_control_cleanup()
667 * When ip_vs_control_cleanup() is activated by ipvs module exit,
668 * the service tables must have been flushed and all the connections
669 * are expired, and the refcnt of each destination in the trash must
670 * be 1, so we simply release them here.
672 static void ip_vs_trash_cleanup(void)
674 struct ip_vs_dest
*dest
, *nxt
;
676 list_for_each_entry_safe(dest
, nxt
, &ip_vs_dest_trash
, n_list
) {
677 list_del(&dest
->n_list
);
678 ip_vs_dst_reset(dest
);
679 __ip_vs_unbind_svc(dest
);
686 ip_vs_zero_stats(struct ip_vs_stats
*stats
)
688 spin_lock_bh(&stats
->lock
);
689 memset(stats
, 0, (char *)&stats
->lock
- (char *)stats
);
690 spin_unlock_bh(&stats
->lock
);
691 ip_vs_zero_estimator(stats
);
695 * Update a destination in the given service
698 __ip_vs_update_dest(struct ip_vs_service
*svc
,
699 struct ip_vs_dest
*dest
, struct ip_vs_dest_user
*udest
)
703 /* set the weight and the flags */
704 atomic_set(&dest
->weight
, udest
->weight
);
705 conn_flags
= udest
->conn_flags
| IP_VS_CONN_F_INACTIVE
;
707 /* check if local node and update the flags */
708 if (inet_addr_type(udest
->addr
) == RTN_LOCAL
) {
709 conn_flags
= (conn_flags
& ~IP_VS_CONN_F_FWD_MASK
)
710 | IP_VS_CONN_F_LOCALNODE
;
713 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
714 if ((conn_flags
& IP_VS_CONN_F_FWD_MASK
) != 0) {
715 conn_flags
|= IP_VS_CONN_F_NOOUTPUT
;
718 * Put the real service in ip_vs_rtable if not present.
719 * For now only for NAT!
721 write_lock_bh(&__ip_vs_rs_lock
);
723 write_unlock_bh(&__ip_vs_rs_lock
);
725 atomic_set(&dest
->conn_flags
, conn_flags
);
727 /* bind the service */
729 __ip_vs_bind_svc(dest
, svc
);
731 if (dest
->svc
!= svc
) {
732 __ip_vs_unbind_svc(dest
);
733 ip_vs_zero_stats(&dest
->stats
);
734 __ip_vs_bind_svc(dest
, svc
);
738 /* set the dest status flags */
739 dest
->flags
|= IP_VS_DEST_F_AVAILABLE
;
741 if (udest
->u_threshold
== 0 || udest
->u_threshold
> dest
->u_threshold
)
742 dest
->flags
&= ~IP_VS_DEST_F_OVERLOAD
;
743 dest
->u_threshold
= udest
->u_threshold
;
744 dest
->l_threshold
= udest
->l_threshold
;
749 * Create a destination for the given service
752 ip_vs_new_dest(struct ip_vs_service
*svc
, struct ip_vs_dest_user
*udest
,
753 struct ip_vs_dest
**dest_p
)
755 struct ip_vs_dest
*dest
;
760 atype
= inet_addr_type(udest
->addr
);
761 if (atype
!= RTN_LOCAL
&& atype
!= RTN_UNICAST
)
764 dest
= kzalloc(sizeof(struct ip_vs_dest
), GFP_ATOMIC
);
766 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
770 dest
->protocol
= svc
->protocol
;
771 dest
->vaddr
= svc
->addr
;
772 dest
->vport
= svc
->port
;
773 dest
->vfwmark
= svc
->fwmark
;
774 dest
->addr
= udest
->addr
;
775 dest
->port
= udest
->port
;
777 atomic_set(&dest
->activeconns
, 0);
778 atomic_set(&dest
->inactconns
, 0);
779 atomic_set(&dest
->persistconns
, 0);
780 atomic_set(&dest
->refcnt
, 0);
782 INIT_LIST_HEAD(&dest
->d_list
);
783 spin_lock_init(&dest
->dst_lock
);
784 spin_lock_init(&dest
->stats
.lock
);
785 __ip_vs_update_dest(svc
, dest
, udest
);
786 ip_vs_new_estimator(&dest
->stats
);
796 * Add a destination into an existing service
799 ip_vs_add_dest(struct ip_vs_service
*svc
, struct ip_vs_dest_user
*udest
)
801 struct ip_vs_dest
*dest
;
802 __be32 daddr
= udest
->addr
;
803 __be16 dport
= udest
->port
;
808 if (udest
->weight
< 0) {
809 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
813 if (udest
->l_threshold
> udest
->u_threshold
) {
814 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
815 "upper threshold\n");
820 * Check if the dest already exists in the list
822 dest
= ip_vs_lookup_dest(svc
, daddr
, dport
);
824 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
829 * Check if the dest already exists in the trash and
830 * is from the same service
832 dest
= ip_vs_trash_get_dest(svc
, daddr
, dport
);
834 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
835 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
836 NIPQUAD(daddr
), ntohs(dport
),
837 atomic_read(&dest
->refcnt
),
839 NIPQUAD(dest
->vaddr
),
841 __ip_vs_update_dest(svc
, dest
, udest
);
844 * Get the destination from the trash
846 list_del(&dest
->n_list
);
848 ip_vs_new_estimator(&dest
->stats
);
850 write_lock_bh(&__ip_vs_svc_lock
);
853 * Wait until all other svc users go away.
855 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 1);
857 list_add(&dest
->n_list
, &svc
->destinations
);
860 /* call the update_service function of its scheduler */
861 svc
->scheduler
->update_service(svc
);
863 write_unlock_bh(&__ip_vs_svc_lock
);
868 * Allocate and initialize the dest structure
870 ret
= ip_vs_new_dest(svc
, udest
, &dest
);
876 * Add the dest entry into the list
878 atomic_inc(&dest
->refcnt
);
880 write_lock_bh(&__ip_vs_svc_lock
);
883 * Wait until all other svc users go away.
885 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 1);
887 list_add(&dest
->n_list
, &svc
->destinations
);
890 /* call the update_service function of its scheduler */
891 svc
->scheduler
->update_service(svc
);
893 write_unlock_bh(&__ip_vs_svc_lock
);
902 * Edit a destination in the given service
905 ip_vs_edit_dest(struct ip_vs_service
*svc
, struct ip_vs_dest_user
*udest
)
907 struct ip_vs_dest
*dest
;
908 __be32 daddr
= udest
->addr
;
909 __be16 dport
= udest
->port
;
913 if (udest
->weight
< 0) {
914 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
918 if (udest
->l_threshold
> udest
->u_threshold
) {
919 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
920 "upper threshold\n");
925 * Lookup the destination list
927 dest
= ip_vs_lookup_dest(svc
, daddr
, dport
);
929 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
933 __ip_vs_update_dest(svc
, dest
, udest
);
935 write_lock_bh(&__ip_vs_svc_lock
);
937 /* Wait until all other svc users go away */
938 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 1);
940 /* call the update_service, because server weight may be changed */
941 svc
->scheduler
->update_service(svc
);
943 write_unlock_bh(&__ip_vs_svc_lock
);
952 * Delete a destination (must be already unlinked from the service)
954 static void __ip_vs_del_dest(struct ip_vs_dest
*dest
)
956 ip_vs_kill_estimator(&dest
->stats
);
959 * Remove it from the d-linked list with the real services.
961 write_lock_bh(&__ip_vs_rs_lock
);
962 ip_vs_rs_unhash(dest
);
963 write_unlock_bh(&__ip_vs_rs_lock
);
966 * Decrease the refcnt of the dest, and free the dest
967 * if nobody refers to it (refcnt=0). Otherwise, throw
968 * the destination into the trash.
970 if (atomic_dec_and_test(&dest
->refcnt
)) {
971 ip_vs_dst_reset(dest
);
972 /* simply decrease svc->refcnt here, let the caller check
973 and release the service if nobody refers to it.
974 Only user context can release destination and service,
975 and only one user context can update virtual service at a
976 time, so the operation here is OK */
977 atomic_dec(&dest
->svc
->refcnt
);
980 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
982 NIPQUAD(dest
->addr
), ntohs(dest
->port
),
983 atomic_read(&dest
->refcnt
));
984 list_add(&dest
->n_list
, &ip_vs_dest_trash
);
985 atomic_inc(&dest
->refcnt
);
991 * Unlink a destination from the given service
993 static void __ip_vs_unlink_dest(struct ip_vs_service
*svc
,
994 struct ip_vs_dest
*dest
,
997 dest
->flags
&= ~IP_VS_DEST_F_AVAILABLE
;
1000 * Remove it from the d-linked destination list.
1002 list_del(&dest
->n_list
);
1006 * Call the update_service function of its scheduler
1008 svc
->scheduler
->update_service(svc
);
1014 * Delete a destination server in the given service
1017 ip_vs_del_dest(struct ip_vs_service
*svc
,struct ip_vs_dest_user
*udest
)
1019 struct ip_vs_dest
*dest
;
1020 __be32 daddr
= udest
->addr
;
1021 __be16 dport
= udest
->port
;
1025 dest
= ip_vs_lookup_dest(svc
, daddr
, dport
);
1027 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1031 write_lock_bh(&__ip_vs_svc_lock
);
1034 * Wait until all other svc users go away.
1036 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 1);
1039 * Unlink dest from the service
1041 __ip_vs_unlink_dest(svc
, dest
, 1);
1043 write_unlock_bh(&__ip_vs_svc_lock
);
1046 * Delete the destination
1048 __ip_vs_del_dest(dest
);
1057 * Add a service into the service hash table
1060 ip_vs_add_service(struct ip_vs_service_user
*u
, struct ip_vs_service
**svc_p
)
1063 struct ip_vs_scheduler
*sched
= NULL
;
1064 struct ip_vs_service
*svc
= NULL
;
1066 /* increase the module use count */
1067 ip_vs_use_count_inc();
1069 /* Lookup the scheduler by 'u->sched_name' */
1070 sched
= ip_vs_scheduler_get(u
->sched_name
);
1071 if (sched
== NULL
) {
1072 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1078 svc
= kzalloc(sizeof(struct ip_vs_service
), GFP_ATOMIC
);
1080 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1085 /* I'm the first user of the service */
1086 atomic_set(&svc
->usecnt
, 1);
1087 atomic_set(&svc
->refcnt
, 0);
1089 svc
->protocol
= u
->protocol
;
1090 svc
->addr
= u
->addr
;
1091 svc
->port
= u
->port
;
1092 svc
->fwmark
= u
->fwmark
;
1093 svc
->flags
= u
->flags
;
1094 svc
->timeout
= u
->timeout
* HZ
;
1095 svc
->netmask
= u
->netmask
;
1097 INIT_LIST_HEAD(&svc
->destinations
);
1098 rwlock_init(&svc
->sched_lock
);
1099 spin_lock_init(&svc
->stats
.lock
);
1101 /* Bind the scheduler */
1102 ret
= ip_vs_bind_scheduler(svc
, sched
);
1107 /* Update the virtual service counters */
1108 if (svc
->port
== FTPPORT
)
1109 atomic_inc(&ip_vs_ftpsvc_counter
);
1110 else if (svc
->port
== 0)
1111 atomic_inc(&ip_vs_nullsvc_counter
);
1113 ip_vs_new_estimator(&svc
->stats
);
1114 ip_vs_num_services
++;
1116 /* Hash the service into the service table */
1117 write_lock_bh(&__ip_vs_svc_lock
);
1118 ip_vs_svc_hash(svc
);
1119 write_unlock_bh(&__ip_vs_svc_lock
);
1127 ip_vs_unbind_scheduler(svc
);
1130 ip_vs_app_inc_put(svc
->inc
);
1135 ip_vs_scheduler_put(sched
);
1138 /* decrease the module use count */
1139 ip_vs_use_count_dec();
1146 * Edit a service and bind it with a new scheduler
1149 ip_vs_edit_service(struct ip_vs_service
*svc
, struct ip_vs_service_user
*u
)
1151 struct ip_vs_scheduler
*sched
, *old_sched
;
1155 * Lookup the scheduler, by 'u->sched_name'
1157 sched
= ip_vs_scheduler_get(u
->sched_name
);
1158 if (sched
== NULL
) {
1159 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1165 write_lock_bh(&__ip_vs_svc_lock
);
1168 * Wait until all other svc users go away.
1170 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 1);
1173 * Set the flags and timeout value
1175 svc
->flags
= u
->flags
| IP_VS_SVC_F_HASHED
;
1176 svc
->timeout
= u
->timeout
* HZ
;
1177 svc
->netmask
= u
->netmask
;
1179 old_sched
= svc
->scheduler
;
1180 if (sched
!= old_sched
) {
1182 * Unbind the old scheduler
1184 if ((ret
= ip_vs_unbind_scheduler(svc
))) {
1190 * Bind the new scheduler
1192 if ((ret
= ip_vs_bind_scheduler(svc
, sched
))) {
1194 * If ip_vs_bind_scheduler fails, restore the old
1196 * The main reason of failure is out of memory.
1198 * The question is if the old scheduler can be
1199 * restored all the time. TODO: if it cannot be
1200 * restored some time, we must delete the service,
1201 * otherwise the system may crash.
1203 ip_vs_bind_scheduler(svc
, old_sched
);
1210 write_unlock_bh(&__ip_vs_svc_lock
);
1213 ip_vs_scheduler_put(old_sched
);
1220 * Delete a service from the service list
1221 * - The service must be unlinked, unlocked and not referenced!
1222 * - We are called under _bh lock
1224 static void __ip_vs_del_service(struct ip_vs_service
*svc
)
1226 struct ip_vs_dest
*dest
, *nxt
;
1227 struct ip_vs_scheduler
*old_sched
;
1229 ip_vs_num_services
--;
1230 ip_vs_kill_estimator(&svc
->stats
);
1232 /* Unbind scheduler */
1233 old_sched
= svc
->scheduler
;
1234 ip_vs_unbind_scheduler(svc
);
1236 ip_vs_scheduler_put(old_sched
);
1238 /* Unbind app inc */
1240 ip_vs_app_inc_put(svc
->inc
);
1245 * Unlink the whole destination list
1247 list_for_each_entry_safe(dest
, nxt
, &svc
->destinations
, n_list
) {
1248 __ip_vs_unlink_dest(svc
, dest
, 0);
1249 __ip_vs_del_dest(dest
);
1253 * Update the virtual service counters
1255 if (svc
->port
== FTPPORT
)
1256 atomic_dec(&ip_vs_ftpsvc_counter
);
1257 else if (svc
->port
== 0)
1258 atomic_dec(&ip_vs_nullsvc_counter
);
1261 * Free the service if nobody refers to it
1263 if (atomic_read(&svc
->refcnt
) == 0)
1266 /* decrease the module use count */
1267 ip_vs_use_count_dec();
1271 * Delete a service from the service list
1273 static int ip_vs_del_service(struct ip_vs_service
*svc
)
1279 * Unhash it from the service table
1281 write_lock_bh(&__ip_vs_svc_lock
);
1283 ip_vs_svc_unhash(svc
);
1286 * Wait until all the svc users go away.
1288 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 1);
1290 __ip_vs_del_service(svc
);
1292 write_unlock_bh(&__ip_vs_svc_lock
);
1299 * Flush all the virtual services
1301 static int ip_vs_flush(void)
1304 struct ip_vs_service
*svc
, *nxt
;
1307 * Flush the service table hashed by <protocol,addr,port>
1309 for(idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
1310 list_for_each_entry_safe(svc
, nxt
, &ip_vs_svc_table
[idx
], s_list
) {
1311 write_lock_bh(&__ip_vs_svc_lock
);
1312 ip_vs_svc_unhash(svc
);
1314 * Wait until all the svc users go away.
1316 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 0);
1317 __ip_vs_del_service(svc
);
1318 write_unlock_bh(&__ip_vs_svc_lock
);
1323 * Flush the service table hashed by fwmark
1325 for(idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
1326 list_for_each_entry_safe(svc
, nxt
,
1327 &ip_vs_svc_fwm_table
[idx
], f_list
) {
1328 write_lock_bh(&__ip_vs_svc_lock
);
1329 ip_vs_svc_unhash(svc
);
1331 * Wait until all the svc users go away.
1333 IP_VS_WAIT_WHILE(atomic_read(&svc
->usecnt
) > 0);
1334 __ip_vs_del_service(svc
);
1335 write_unlock_bh(&__ip_vs_svc_lock
);
1344 * Zero counters in a service or all services
1346 static int ip_vs_zero_service(struct ip_vs_service
*svc
)
1348 struct ip_vs_dest
*dest
;
1350 write_lock_bh(&__ip_vs_svc_lock
);
1351 list_for_each_entry(dest
, &svc
->destinations
, n_list
) {
1352 ip_vs_zero_stats(&dest
->stats
);
1354 ip_vs_zero_stats(&svc
->stats
);
1355 write_unlock_bh(&__ip_vs_svc_lock
);
1359 static int ip_vs_zero_all(void)
1362 struct ip_vs_service
*svc
;
1364 for(idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
1365 list_for_each_entry(svc
, &ip_vs_svc_table
[idx
], s_list
) {
1366 ip_vs_zero_service(svc
);
1370 for(idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
1371 list_for_each_entry(svc
, &ip_vs_svc_fwm_table
[idx
], f_list
) {
1372 ip_vs_zero_service(svc
);
1376 ip_vs_zero_stats(&ip_vs_stats
);
1382 proc_do_defense_mode(ctl_table
*table
, int write
, struct file
* filp
,
1383 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
1385 int *valp
= table
->data
;
1389 rc
= proc_dointvec(table
, write
, filp
, buffer
, lenp
, ppos
);
1390 if (write
&& (*valp
!= val
)) {
1391 if ((*valp
< 0) || (*valp
> 3)) {
1392 /* Restore the correct value */
1395 update_defense_level();
1403 proc_do_sync_threshold(ctl_table
*table
, int write
, struct file
*filp
,
1404 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
1406 int *valp
= table
->data
;
1410 /* backup the value first */
1411 memcpy(val
, valp
, sizeof(val
));
1413 rc
= proc_dointvec(table
, write
, filp
, buffer
, lenp
, ppos
);
1414 if (write
&& (valp
[0] < 0 || valp
[1] < 0 || valp
[0] >= valp
[1])) {
1415 /* Restore the correct value */
1416 memcpy(valp
, val
, sizeof(val
));
1423 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1426 static struct ctl_table vs_vars
[] = {
1428 .ctl_name
= NET_IPV4_VS_AMEMTHRESH
,
1429 .procname
= "amemthresh",
1430 .data
= &sysctl_ip_vs_amemthresh
,
1431 .maxlen
= sizeof(int),
1433 .proc_handler
= &proc_dointvec
,
1435 #ifdef CONFIG_IP_VS_DEBUG
1437 .ctl_name
= NET_IPV4_VS_DEBUG_LEVEL
,
1438 .procname
= "debug_level",
1439 .data
= &sysctl_ip_vs_debug_level
,
1440 .maxlen
= sizeof(int),
1442 .proc_handler
= &proc_dointvec
,
1446 .ctl_name
= NET_IPV4_VS_AMDROPRATE
,
1447 .procname
= "am_droprate",
1448 .data
= &sysctl_ip_vs_am_droprate
,
1449 .maxlen
= sizeof(int),
1451 .proc_handler
= &proc_dointvec
,
1454 .ctl_name
= NET_IPV4_VS_DROP_ENTRY
,
1455 .procname
= "drop_entry",
1456 .data
= &sysctl_ip_vs_drop_entry
,
1457 .maxlen
= sizeof(int),
1459 .proc_handler
= &proc_do_defense_mode
,
1462 .ctl_name
= NET_IPV4_VS_DROP_PACKET
,
1463 .procname
= "drop_packet",
1464 .data
= &sysctl_ip_vs_drop_packet
,
1465 .maxlen
= sizeof(int),
1467 .proc_handler
= &proc_do_defense_mode
,
1470 .ctl_name
= NET_IPV4_VS_SECURE_TCP
,
1471 .procname
= "secure_tcp",
1472 .data
= &sysctl_ip_vs_secure_tcp
,
1473 .maxlen
= sizeof(int),
1475 .proc_handler
= &proc_do_defense_mode
,
1479 .ctl_name
= NET_IPV4_VS_TO_ES
,
1480 .procname
= "timeout_established",
1481 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_ESTABLISHED
],
1482 .maxlen
= sizeof(int),
1484 .proc_handler
= &proc_dointvec_jiffies
,
1487 .ctl_name
= NET_IPV4_VS_TO_SS
,
1488 .procname
= "timeout_synsent",
1489 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_SYN_SENT
],
1490 .maxlen
= sizeof(int),
1492 .proc_handler
= &proc_dointvec_jiffies
,
1495 .ctl_name
= NET_IPV4_VS_TO_SR
,
1496 .procname
= "timeout_synrecv",
1497 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_SYN_RECV
],
1498 .maxlen
= sizeof(int),
1500 .proc_handler
= &proc_dointvec_jiffies
,
1503 .ctl_name
= NET_IPV4_VS_TO_FW
,
1504 .procname
= "timeout_finwait",
1505 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_FIN_WAIT
],
1506 .maxlen
= sizeof(int),
1508 .proc_handler
= &proc_dointvec_jiffies
,
1511 .ctl_name
= NET_IPV4_VS_TO_TW
,
1512 .procname
= "timeout_timewait",
1513 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_TIME_WAIT
],
1514 .maxlen
= sizeof(int),
1516 .proc_handler
= &proc_dointvec_jiffies
,
1519 .ctl_name
= NET_IPV4_VS_TO_CL
,
1520 .procname
= "timeout_close",
1521 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_CLOSE
],
1522 .maxlen
= sizeof(int),
1524 .proc_handler
= &proc_dointvec_jiffies
,
1527 .ctl_name
= NET_IPV4_VS_TO_CW
,
1528 .procname
= "timeout_closewait",
1529 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_CLOSE_WAIT
],
1530 .maxlen
= sizeof(int),
1532 .proc_handler
= &proc_dointvec_jiffies
,
1535 .ctl_name
= NET_IPV4_VS_TO_LA
,
1536 .procname
= "timeout_lastack",
1537 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_LAST_ACK
],
1538 .maxlen
= sizeof(int),
1540 .proc_handler
= &proc_dointvec_jiffies
,
1543 .ctl_name
= NET_IPV4_VS_TO_LI
,
1544 .procname
= "timeout_listen",
1545 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_LISTEN
],
1546 .maxlen
= sizeof(int),
1548 .proc_handler
= &proc_dointvec_jiffies
,
1551 .ctl_name
= NET_IPV4_VS_TO_SA
,
1552 .procname
= "timeout_synack",
1553 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_SYNACK
],
1554 .maxlen
= sizeof(int),
1556 .proc_handler
= &proc_dointvec_jiffies
,
1559 .ctl_name
= NET_IPV4_VS_TO_UDP
,
1560 .procname
= "timeout_udp",
1561 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_UDP
],
1562 .maxlen
= sizeof(int),
1564 .proc_handler
= &proc_dointvec_jiffies
,
1567 .ctl_name
= NET_IPV4_VS_TO_ICMP
,
1568 .procname
= "timeout_icmp",
1569 .data
= &vs_timeout_table_dos
.timeout
[IP_VS_S_ICMP
],
1570 .maxlen
= sizeof(int),
1572 .proc_handler
= &proc_dointvec_jiffies
,
1576 .ctl_name
= NET_IPV4_VS_CACHE_BYPASS
,
1577 .procname
= "cache_bypass",
1578 .data
= &sysctl_ip_vs_cache_bypass
,
1579 .maxlen
= sizeof(int),
1581 .proc_handler
= &proc_dointvec
,
1584 .ctl_name
= NET_IPV4_VS_EXPIRE_NODEST_CONN
,
1585 .procname
= "expire_nodest_conn",
1586 .data
= &sysctl_ip_vs_expire_nodest_conn
,
1587 .maxlen
= sizeof(int),
1589 .proc_handler
= &proc_dointvec
,
1592 .ctl_name
= NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE
,
1593 .procname
= "expire_quiescent_template",
1594 .data
= &sysctl_ip_vs_expire_quiescent_template
,
1595 .maxlen
= sizeof(int),
1597 .proc_handler
= &proc_dointvec
,
1600 .ctl_name
= NET_IPV4_VS_SYNC_THRESHOLD
,
1601 .procname
= "sync_threshold",
1602 .data
= &sysctl_ip_vs_sync_threshold
,
1603 .maxlen
= sizeof(sysctl_ip_vs_sync_threshold
),
1605 .proc_handler
= &proc_do_sync_threshold
,
1608 .ctl_name
= NET_IPV4_VS_NAT_ICMP_SEND
,
1609 .procname
= "nat_icmp_send",
1610 .data
= &sysctl_ip_vs_nat_icmp_send
,
1611 .maxlen
= sizeof(int),
1613 .proc_handler
= &proc_dointvec
,
/*
 * sysctl table whose visible entry is keyed by NET_IPV4_VS.
 * NOTE(review): the entry's other fields and the table terminator are not
 * visible in this listing -- confirm against the complete source.
 */
1618 static ctl_table vs_table
[] = {
1620 .ctl_name
= NET_IPV4_VS
,
/*
 * sysctl table whose visible entry is keyed by NET_IPV4; it is referenced
 * as the .child of vs_root_table.
 * NOTE(review): the entry's other fields and the table terminator are not
 * visible in this listing -- confirm against the complete source.
 */
1628 static ctl_table ipvs_ipv4_table
[] = {
1630 .ctl_name
= NET_IPV4
,
/*
 * Root sysctl table (keyed by CTL_NET) with ipvs_ipv4_table as its child.
 * This is the table handed to register_sysctl_table() by
 * ip_vs_control_init().
 * NOTE(review): some fields and the table terminator are not visible in
 * this listing -- confirm against the complete source.
 */
1638 static ctl_table vs_root_table
[] = {
1640 .ctl_name
= CTL_NET
,
1643 .child
= ipvs_ipv4_table
,
/*
 * Handle returned by register_sysctl_table() in ip_vs_control_init();
 * passed to unregister_sysctl_table() in ip_vs_control_cleanup().
 */
1648 static struct ctl_table_header
* sysctl_header
;
1650 #ifdef CONFIG_PROC_FS
/*
 * Iterator state member: the hash table currently being walked.  The
 * seq_file callbacks compare it against ip_vs_svc_table and
 * ip_vs_svc_fwm_table to tell which list linkage to follow.
 * NOTE(review): the enclosing struct header and the 'bucket' member used by
 * those callbacks are not visible in this listing.
 */
1653 struct list_head
*table
;
/*
 *	Write the contents of the VS rule table to a PROCfs file.
 *	(It is kept just for backward compatibility)
 */
/*
 * Map the forwarding-method bits of a connection flag word
 * (flags & IP_VS_CONN_F_FWD_MASK) to a printable name; the result is
 * printed with "%-7s" for each destination by ip_vs_info_seq_show().
 * NOTE(review): the returned strings and any default case are not visible
 * in this listing -- confirm against the complete source.
 */
1661 static inline const char *ip_vs_fwd_name(unsigned flags
)
1663 switch (flags
& IP_VS_CONN_F_FWD_MASK
) {
1664 case IP_VS_CONN_F_LOCALNODE
:
1666 case IP_VS_CONN_F_TUNNEL
:
1668 case IP_VS_CONN_F_DROUTE
:
1676 /* Get the Nth entry in the two lists */
/*
 * Walks every bucket of ip_vs_svc_table (linked through s_list) and then
 * every bucket of ip_vs_svc_fwm_table (linked through f_list), recording in
 * iter->table which table the selected service came from.
 * NOTE(review): the lines that count 'pos' down, store the bucket index and
 * return the match are not visible in this listing -- confirm against the
 * complete source.
 */
1677 static struct ip_vs_service
*ip_vs_info_array(struct seq_file
*seq
, loff_t pos
)
1679 struct ip_vs_iter
*iter
= seq
->private;
1681 struct ip_vs_service
*svc
;
1683 /* look in hash by protocol */
1684 for (idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
1685 list_for_each_entry(svc
, &ip_vs_svc_table
[idx
], s_list
) {
1687 iter
->table
= ip_vs_svc_table
;
1694 /* keep looking in fwmark */
1695 for (idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
1696 list_for_each_entry(svc
, &ip_vs_svc_fwm_table
[idx
], f_list
) {
1698 iter
->table
= ip_vs_svc_fwm_table
;
1708 static void *ip_vs_info_seq_start(struct seq_file
*seq
, loff_t
*pos
)
1711 read_lock_bh(&__ip_vs_svc_lock
);
1712 return *pos
? ip_vs_info_array(seq
, *pos
- 1) : SEQ_START_TOKEN
;
/*
 * seq_file ->next callback: advance from service 'v' to the following one.
 *
 * After the start token the first entry comes from ip_vs_info_array(seq, 0).
 * Otherwise, while iter->table is ip_vs_svc_table, the walk follows s_list
 * within the current bucket and then scans the later buckets; after that it
 * switches iter->table to ip_vs_svc_fwm_table and continues the same way
 * over f_list.
 * NOTE(review): several lines (position increment, 'svc' assignment, bucket
 * reset, returns and labels) are not visible in this listing -- confirm
 * against the complete source before editing.
 */
1716 static void *ip_vs_info_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
1718 struct list_head
*e
;
1719 struct ip_vs_iter
*iter
;
1720 struct ip_vs_service
*svc
;
1723 if (v
== SEQ_START_TOKEN
)
1724 return ip_vs_info_array(seq
,0);
1727 iter
= seq
->private;
1729 if (iter
->table
== ip_vs_svc_table
) {
1730 /* next service in table hashed by protocol */
1731 if ((e
= svc
->s_list
.next
) != &ip_vs_svc_table
[iter
->bucket
])
1732 return list_entry(e
, struct ip_vs_service
, s_list
);
1735 while (++iter
->bucket
< IP_VS_SVC_TAB_SIZE
) {
1736 list_for_each_entry(svc
,&ip_vs_svc_table
[iter
->bucket
],
1742 iter
->table
= ip_vs_svc_fwm_table
;
1747 /* next service in hashed by fwmark */
1748 if ((e
= svc
->f_list
.next
) != &ip_vs_svc_fwm_table
[iter
->bucket
])
1749 return list_entry(e
, struct ip_vs_service
, f_list
);
1752 while (++iter
->bucket
< IP_VS_SVC_TAB_SIZE
) {
1753 list_for_each_entry(svc
, &ip_vs_svc_fwm_table
[iter
->bucket
],
1761 static void ip_vs_info_seq_stop(struct seq_file
*seq
, void *v
)
1763 read_unlock_bh(&__ip_vs_svc_lock
);
/*
 * seq_file ->show callback.
 *
 * For the start token it prints the version banner and the two column
 * header lines.  For a service it prints one line -- protocol, address and
 * port for entries from ip_vs_svc_table, or "FWM <mark>" otherwise -- plus
 * the scheduler name, appends the persistence details when
 * IP_VS_SVC_F_PERSISTENT is set, and then prints one "->" line per
 * destination on svc->destinations.
 * NOTE(review): several seq_printf()/seq_puts() call heads and arguments are
 * not visible in this listing -- confirm against the complete source.
 */
1767 static int ip_vs_info_seq_show(struct seq_file
*seq
, void *v
)
1769 if (v
== SEQ_START_TOKEN
) {
1771 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1772 NVERSION(IP_VS_VERSION_CODE
), IP_VS_CONN_TAB_SIZE
);
1774 "Prot LocalAddress:Port Scheduler Flags\n");
1776 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1778 const struct ip_vs_service
*svc
= v
;
1779 const struct ip_vs_iter
*iter
= seq
->private;
1780 const struct ip_vs_dest
*dest
;
1782 if (iter
->table
== ip_vs_svc_table
)
1783 seq_printf(seq
, "%s %08X:%04X %s ",
1784 ip_vs_proto_name(svc
->protocol
),
1787 svc
->scheduler
->name
);
1789 seq_printf(seq
, "FWM %08X %s ",
1790 svc
->fwmark
, svc
->scheduler
->name
);
1792 if (svc
->flags
& IP_VS_SVC_F_PERSISTENT
)
1793 seq_printf(seq
, "persistent %d %08X\n",
1795 ntohl(svc
->netmask
));
1797 seq_putc(seq
, '\n');
1799 list_for_each_entry(dest
, &svc
->destinations
, n_list
) {
1801 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1802 ntohl(dest
->addr
), ntohs(dest
->port
),
1803 ip_vs_fwd_name(atomic_read(&dest
->conn_flags
)),
1804 atomic_read(&dest
->weight
),
1805 atomic_read(&dest
->activeconns
),
1806 atomic_read(&dest
->inactconns
));
1812 static const struct seq_operations ip_vs_info_seq_ops
= {
1813 .start
= ip_vs_info_seq_start
,
1814 .next
= ip_vs_info_seq_next
,
1815 .stop
= ip_vs_info_seq_stop
,
1816 .show
= ip_vs_info_seq_show
,
1819 static int ip_vs_info_open(struct inode
*inode
, struct file
*file
)
1821 return seq_open_private(file
, &ip_vs_info_seq_ops
,
1822 sizeof(struct ip_vs_iter
));
/*
 * File operations registered for the "ip_vs" proc entry by
 * ip_vs_control_init().  Release uses seq_release_private to match the
 * seq_open_private() call in ip_vs_info_open().
 * NOTE(review): the member between .open and .llseek is not visible in this
 * listing -- confirm against the complete source.
 */
1825 static const struct file_operations ip_vs_info_fops
= {
1826 .owner
= THIS_MODULE
,
1827 .open
= ip_vs_info_open
,
1829 .llseek
= seq_lseek
,
1830 .release
= seq_release_private
,
/*
 * Global IPVS statistics object: zeroed and given its lock in
 * ip_vs_control_init(), and printed by ip_vs_stats_show() under
 * ip_vs_stats.lock.
 */
1835 struct ip_vs_stats ip_vs_stats
;
1837 #ifdef CONFIG_PROC_FS
/*
 * single_open() show routine for the global statistics.
 *
 * Prints the column headers, then -- holding ip_vs_stats.lock with bottom
 * halves disabled -- one hexadecimal row of totals (conns, inpkts, outpkts,
 * inbytes, outbytes) followed by a headed row of rate values.
 * NOTE(review): the seq_puts() call heads and most arguments of the rate
 * row are not visible in this listing -- confirm against the complete
 * source.
 */
1838 static int ip_vs_stats_show(struct seq_file
*seq
, void *v
)
1841 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1843 " Total Incoming Outgoing Incoming Outgoing\n");
1845 " Conns Packets Packets Bytes Bytes\n");
1847 spin_lock_bh(&ip_vs_stats
.lock
);
1848 seq_printf(seq
, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats
.conns
,
1849 ip_vs_stats
.inpkts
, ip_vs_stats
.outpkts
,
1850 (unsigned long long) ip_vs_stats
.inbytes
,
1851 (unsigned long long) ip_vs_stats
.outbytes
);
1853 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1855 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1856 seq_printf(seq
,"%8X %8X %8X %16X %16X\n",
1861 ip_vs_stats
.outbps
);
1862 spin_unlock_bh(&ip_vs_stats
.lock
);
1867 static int ip_vs_stats_seq_open(struct inode
*inode
, struct file
*file
)
1869 return single_open(file
, ip_vs_stats_show
, NULL
);
/*
 * File operations registered for the "ip_vs_stats" proc entry by
 * ip_vs_control_init().  Release uses single_release to match the
 * single_open() call in ip_vs_stats_seq_open().
 * NOTE(review): the member between .open and .llseek is not visible in this
 * listing -- confirm against the complete source.
 */
1872 static const struct file_operations ip_vs_stats_fops
= {
1873 .owner
= THIS_MODULE
,
1874 .open
= ip_vs_stats_seq_open
,
1876 .llseek
= seq_lseek
,
1877 .release
= single_release
,
/*
 *	Set timeout values for tcp tcpfin udp in the timeout_table.
 */
/*
 * Apply user-supplied timeouts (given in seconds, stored multiplied by HZ).
 *
 * A zero field leaves the corresponding timeout untouched.  tcp_timeout
 * updates the TCP ESTABLISHED slot, tcp_fin_timeout the TCP FIN_WAIT slot
 * (both only when CONFIG_IP_VS_PROTO_TCP is set) and udp_timeout the UDP
 * NORMAL slot (only when CONFIG_IP_VS_PROTO_UDP is set).
 * NOTE(review): the debug arguments, the #endif lines and the return
 * statement are not visible in this listing -- confirm against the complete
 * source.
 */
1885 static int ip_vs_set_timeout(struct ip_vs_timeout_user
*u
)
1887 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1892 #ifdef CONFIG_IP_VS_PROTO_TCP
1893 if (u
->tcp_timeout
) {
1894 ip_vs_protocol_tcp
.timeout_table
[IP_VS_TCP_S_ESTABLISHED
]
1895 = u
->tcp_timeout
* HZ
;
1898 if (u
->tcp_fin_timeout
) {
1899 ip_vs_protocol_tcp
.timeout_table
[IP_VS_TCP_S_FIN_WAIT
]
1900 = u
->tcp_fin_timeout
* HZ
;
1904 #ifdef CONFIG_IP_VS_PROTO_UDP
1905 if (u
->udp_timeout
) {
1906 ip_vs_protocol_udp
.timeout_table
[IP_VS_UDP_S_NORMAL
]
1907 = u
->udp_timeout
* HZ
;
/*
 * Argument-size bookkeeping for the IP_VS_SO_SET_* sockopt commands.
 *
 * SET_CMDID() converts a command number into a zero-based index into
 * set_arglen[]; the *_ARG_LEN macros give the exact payload size each
 * command expects, and MAX_ARG_LEN is the largest of them (it sizes the
 * on-stack buffer in do_ip_vs_set_ctl()).
 *
 * The macro argument is parenthesized so that an expression argument
 * (e.g. a conditional) is subtracted as a whole.
 */
#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN
1922 static const unsigned char set_arglen
[SET_CMDID(IP_VS_SO_SET_MAX
)+1] = {
1923 [SET_CMDID(IP_VS_SO_SET_ADD
)] = SERVICE_ARG_LEN
,
1924 [SET_CMDID(IP_VS_SO_SET_EDIT
)] = SERVICE_ARG_LEN
,
1925 [SET_CMDID(IP_VS_SO_SET_DEL
)] = SERVICE_ARG_LEN
,
1926 [SET_CMDID(IP_VS_SO_SET_FLUSH
)] = 0,
1927 [SET_CMDID(IP_VS_SO_SET_ADDDEST
)] = SVCDEST_ARG_LEN
,
1928 [SET_CMDID(IP_VS_SO_SET_DELDEST
)] = SVCDEST_ARG_LEN
,
1929 [SET_CMDID(IP_VS_SO_SET_EDITDEST
)] = SVCDEST_ARG_LEN
,
1930 [SET_CMDID(IP_VS_SO_SET_TIMEOUT
)] = TIMEOUT_ARG_LEN
,
1931 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON
)] = DAEMON_ARG_LEN
,
1932 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON
)] = DAEMON_ARG_LEN
,
1933 [SET_CMDID(IP_VS_SO_SET_ZERO
)] = SERVICE_ARG_LEN
,
/*
 * setsockopt() handler for the IP_VS_SO_SET_* commands.
 *
 * Visible flow: require CAP_NET_ADMIN, insist that len equals
 * set_arglen[SET_CMDID(cmd)], copy the argument into the on-stack arg[]
 * buffer, bump the module use count and take __ip_vs_mutex interruptibly.
 * FLUSH, TIMEOUT, STARTDAEMON and STOPDAEMON are handled directly.  Every
 * other command treats arg as a struct ip_vs_service_user immediately
 * followed by a struct ip_vs_dest_user, validates the protocol (TCP or UDP),
 * looks the service up by fwmark or by <protocol, addr, port>, and then
 * dispatches on cmd.  The mutex is released and the use count dropped on
 * the way out.
 *
 * NOTE(review): this listing has lost lines (braces, return/goto
 * statements, error codes), so the exact error paths cannot be read off
 * here -- confirm against the complete source before changing anything.
 */
1937 do_ip_vs_set_ctl(struct sock
*sk
, int cmd
, void __user
*user
, unsigned int len
)
1940 unsigned char arg
[MAX_ARG_LEN
];
1941 struct ip_vs_service_user
*usvc
;
1942 struct ip_vs_service
*svc
;
1943 struct ip_vs_dest_user
*udest
;
1945 if (!capable(CAP_NET_ADMIN
))
1948 if (len
!= set_arglen
[SET_CMDID(cmd
)]) {
1949 IP_VS_ERR("set_ctl: len %u != %u\n",
1950 len
, set_arglen
[SET_CMDID(cmd
)]);
1954 if (copy_from_user(arg
, user
, len
) != 0)
1957 /* increase the module use count */
1958 ip_vs_use_count_inc();
1960 if (mutex_lock_interruptible(&__ip_vs_mutex
)) {
1965 if (cmd
== IP_VS_SO_SET_FLUSH
) {
1966 /* Flush the virtual service */
1967 ret
= ip_vs_flush();
1969 } else if (cmd
== IP_VS_SO_SET_TIMEOUT
) {
1970 /* Set timeout values for (tcp tcpfin udp) */
1971 ret
= ip_vs_set_timeout((struct ip_vs_timeout_user
*)arg
);
1973 } else if (cmd
== IP_VS_SO_SET_STARTDAEMON
) {
1974 struct ip_vs_daemon_user
*dm
= (struct ip_vs_daemon_user
*)arg
;
1975 ret
= start_sync_thread(dm
->state
, dm
->mcast_ifn
, dm
->syncid
);
1977 } else if (cmd
== IP_VS_SO_SET_STOPDAEMON
) {
1978 struct ip_vs_daemon_user
*dm
= (struct ip_vs_daemon_user
*)arg
;
1979 ret
= stop_sync_thread(dm
->state
);
/* the destination description, when present, directly follows the service */
1983 usvc
= (struct ip_vs_service_user
*)arg
;
1984 udest
= (struct ip_vs_dest_user
*)(usvc
+ 1);
1986 if (cmd
== IP_VS_SO_SET_ZERO
) {
1987 /* if no service address is set, zero counters in all */
1988 if (!usvc
->fwmark
&& !usvc
->addr
&& !usvc
->port
) {
1989 ret
= ip_vs_zero_all();
1994 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1995 if (usvc
->protocol
!=IPPROTO_TCP
&& usvc
->protocol
!=IPPROTO_UDP
) {
1996 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1997 usvc
->protocol
, NIPQUAD(usvc
->addr
),
1998 ntohs(usvc
->port
), usvc
->sched_name
);
2003 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2004 if (usvc
->fwmark
== 0)
2005 svc
= __ip_vs_service_get(usvc
->protocol
,
2006 usvc
->addr
, usvc
->port
);
2008 svc
= __ip_vs_svc_fwm_get(usvc
->fwmark
);
/* every command except ADD needs an existing service of the same protocol */
2010 if (cmd
!= IP_VS_SO_SET_ADD
2011 && (svc
== NULL
|| svc
->protocol
!= usvc
->protocol
)) {
2017 case IP_VS_SO_SET_ADD
:
2021 ret
= ip_vs_add_service(usvc
, &svc
);
2023 case IP_VS_SO_SET_EDIT
:
2024 ret
= ip_vs_edit_service(svc
, usvc
);
2026 case IP_VS_SO_SET_DEL
:
2027 ret
= ip_vs_del_service(svc
);
2031 case IP_VS_SO_SET_ZERO
:
2032 ret
= ip_vs_zero_service(svc
);
2034 case IP_VS_SO_SET_ADDDEST
:
2035 ret
= ip_vs_add_dest(svc
, udest
);
2037 case IP_VS_SO_SET_EDITDEST
:
2038 ret
= ip_vs_edit_dest(svc
, udest
);
2040 case IP_VS_SO_SET_DELDEST
:
2041 ret
= ip_vs_del_dest(svc
, udest
);
2048 ip_vs_service_put(svc
);
2051 mutex_unlock(&__ip_vs_mutex
);
2053 /* decrease the module use count */
2054 ip_vs_use_count_dec();
2061 ip_vs_copy_stats(struct ip_vs_stats_user
*dst
, struct ip_vs_stats
*src
)
2063 spin_lock_bh(&src
->lock
);
2064 memcpy(dst
, src
, (char*)&src
->lock
- (char*)src
);
2065 spin_unlock_bh(&src
->lock
);
2069 ip_vs_copy_service(struct ip_vs_service_entry
*dst
, struct ip_vs_service
*src
)
2071 dst
->protocol
= src
->protocol
;
2072 dst
->addr
= src
->addr
;
2073 dst
->port
= src
->port
;
2074 dst
->fwmark
= src
->fwmark
;
2075 strlcpy(dst
->sched_name
, src
->scheduler
->name
, sizeof(dst
->sched_name
));
2076 dst
->flags
= src
->flags
;
2077 dst
->timeout
= src
->timeout
/ HZ
;
2078 dst
->netmask
= src
->netmask
;
2079 dst
->num_dests
= src
->num_dests
;
2080 ip_vs_copy_stats(&dst
->stats
, &src
->stats
);
/*
 * Copy up to get->num_services service entries into uptr->entrytable[].
 *
 * Services are taken first from every bucket of ip_vs_svc_table (s_list)
 * and then from every bucket of ip_vs_svc_fwm_table (f_list).  Each entry
 * is built in a zeroed on-stack struct by ip_vs_copy_service() and written
 * out with copy_to_user().
 * NOTE(review): the counter handling, the error code set on a failed copy
 * and the exit path are not visible in this listing -- confirm against the
 * complete source.
 */
2084 __ip_vs_get_service_entries(const struct ip_vs_get_services
*get
,
2085 struct ip_vs_get_services __user
*uptr
)
2088 struct ip_vs_service
*svc
;
2089 struct ip_vs_service_entry entry
;
2092 for (idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
2093 list_for_each_entry(svc
, &ip_vs_svc_table
[idx
], s_list
) {
2094 if (count
>= get
->num_services
)
2096 memset(&entry
, 0, sizeof(entry
));
2097 ip_vs_copy_service(&entry
, svc
);
2098 if (copy_to_user(&uptr
->entrytable
[count
],
2099 &entry
, sizeof(entry
))) {
2107 for (idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
2108 list_for_each_entry(svc
, &ip_vs_svc_fwm_table
[idx
], f_list
) {
2109 if (count
>= get
->num_services
)
2111 memset(&entry
, 0, sizeof(entry
));
2112 ip_vs_copy_service(&entry
, svc
);
2113 if (copy_to_user(&uptr
->entrytable
[count
],
2114 &entry
, sizeof(entry
))) {
2126 __ip_vs_get_dest_entries(const struct ip_vs_get_dests
*get
,
2127 struct ip_vs_get_dests __user
*uptr
)
2129 struct ip_vs_service
*svc
;
2133 svc
= __ip_vs_svc_fwm_get(get
->fwmark
);
2135 svc
= __ip_vs_service_get(get
->protocol
,
2136 get
->addr
, get
->port
);
2139 struct ip_vs_dest
*dest
;
2140 struct ip_vs_dest_entry entry
;
2142 list_for_each_entry(dest
, &svc
->destinations
, n_list
) {
2143 if (count
>= get
->num_dests
)
2146 entry
.addr
= dest
->addr
;
2147 entry
.port
= dest
->port
;
2148 entry
.conn_flags
= atomic_read(&dest
->conn_flags
);
2149 entry
.weight
= atomic_read(&dest
->weight
);
2150 entry
.u_threshold
= dest
->u_threshold
;
2151 entry
.l_threshold
= dest
->l_threshold
;
2152 entry
.activeconns
= atomic_read(&dest
->activeconns
);
2153 entry
.inactconns
= atomic_read(&dest
->inactconns
);
2154 entry
.persistconns
= atomic_read(&dest
->persistconns
);
2155 ip_vs_copy_stats(&entry
.stats
, &dest
->stats
);
2156 if (copy_to_user(&uptr
->entrytable
[count
],
2157 &entry
, sizeof(entry
))) {
2163 ip_vs_service_put(svc
);
2170 __ip_vs_get_timeouts(struct ip_vs_timeout_user
*u
)
2172 #ifdef CONFIG_IP_VS_PROTO_TCP
2174 ip_vs_protocol_tcp
.timeout_table
[IP_VS_TCP_S_ESTABLISHED
] / HZ
;
2175 u
->tcp_fin_timeout
=
2176 ip_vs_protocol_tcp
.timeout_table
[IP_VS_TCP_S_FIN_WAIT
] / HZ
;
2178 #ifdef CONFIG_IP_VS_PROTO_UDP
2180 ip_vs_protocol_udp
.timeout_table
[IP_VS_UDP_S_NORMAL
] / HZ
;
/*
 * Argument-size bookkeeping for the IP_VS_SO_GET_* sockopt commands.
 *
 * GET_CMDID() converts a command number into a zero-based index into
 * get_arglen[]; the GET_*_ARG_LEN macros give the minimum request size for
 * each command.
 *
 * The macro argument is parenthesized so that an expression argument
 * (e.g. a conditional) is subtracted as a whole.
 */
#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
2193 static const unsigned char get_arglen
[GET_CMDID(IP_VS_SO_GET_MAX
)+1] = {
2194 [GET_CMDID(IP_VS_SO_GET_VERSION
)] = 64,
2195 [GET_CMDID(IP_VS_SO_GET_INFO
)] = GET_INFO_ARG_LEN
,
2196 [GET_CMDID(IP_VS_SO_GET_SERVICES
)] = GET_SERVICES_ARG_LEN
,
2197 [GET_CMDID(IP_VS_SO_GET_SERVICE
)] = GET_SERVICE_ARG_LEN
,
2198 [GET_CMDID(IP_VS_SO_GET_DESTS
)] = GET_DESTS_ARG_LEN
,
2199 [GET_CMDID(IP_VS_SO_GET_TIMEOUT
)] = GET_TIMEOUT_ARG_LEN
,
2200 [GET_CMDID(IP_VS_SO_GET_DAEMON
)] = GET_DAEMON_ARG_LEN
,
/*
 * getsockopt() handler for the IP_VS_SO_GET_* commands.
 *
 * Visible flow: require CAP_NET_ADMIN, reject a *len smaller than
 * get_arglen[GET_CMDID(cmd)], copy that many bytes of the request into the
 * on-stack arg[] buffer, take __ip_vs_mutex (returning -ERESTARTSYS if
 * interrupted) and dispatch on cmd.  Replies are written back either with
 * copy_to_user() directly (VERSION, INFO, SERVICE, TIMEOUT, DAEMON) or
 * through __ip_vs_get_service_entries() / __ip_vs_get_dest_entries()
 * (SERVICES, DESTS).  The mutex is released before returning.
 *
 * NOTE(review): this listing has lost lines (braces, size checks, error
 * codes, break/goto statements) -- confirm against the complete source
 * before changing anything.
 */
2204 do_ip_vs_get_ctl(struct sock
*sk
, int cmd
, void __user
*user
, int *len
)
/* NOTE(review): arg[] must be at least as large as every get_arglen[] value -- verify */
2206 unsigned char arg
[128];
2209 if (!capable(CAP_NET_ADMIN
))
2212 if (*len
< get_arglen
[GET_CMDID(cmd
)]) {
2213 IP_VS_ERR("get_ctl: len %u < %u\n",
2214 *len
, get_arglen
[GET_CMDID(cmd
)]);
2218 if (copy_from_user(arg
, user
, get_arglen
[GET_CMDID(cmd
)]) != 0)
2221 if (mutex_lock_interruptible(&__ip_vs_mutex
))
2222 return -ERESTARTSYS
;
2225 case IP_VS_SO_GET_VERSION
:
2229 sprintf(buf
, "IP Virtual Server version %d.%d.%d (size=%d)",
2230 NVERSION(IP_VS_VERSION_CODE
), IP_VS_CONN_TAB_SIZE
);
2231 if (copy_to_user(user
, buf
, strlen(buf
)+1) != 0) {
2235 *len
= strlen(buf
)+1;
2239 case IP_VS_SO_GET_INFO
:
2241 struct ip_vs_getinfo info
;
2242 info
.version
= IP_VS_VERSION_CODE
;
2243 info
.size
= IP_VS_CONN_TAB_SIZE
;
2244 info
.num_services
= ip_vs_num_services
;
2245 if (copy_to_user(user
, &info
, sizeof(info
)) != 0)
2250 case IP_VS_SO_GET_SERVICES
:
2252 struct ip_vs_get_services
*get
;
2255 get
= (struct ip_vs_get_services
*)arg
;
/* expected total size: fixed header plus num_services entries */
2256 size
= sizeof(*get
) +
2257 sizeof(struct ip_vs_service_entry
) * get
->num_services
;
2259 IP_VS_ERR("length: %u != %u\n", *len
, size
);
2263 ret
= __ip_vs_get_service_entries(get
, user
);
2267 case IP_VS_SO_GET_SERVICE
:
2269 struct ip_vs_service_entry
*entry
;
2270 struct ip_vs_service
*svc
;
2272 entry
= (struct ip_vs_service_entry
*)arg
;
2274 svc
= __ip_vs_svc_fwm_get(entry
->fwmark
);
2276 svc
= __ip_vs_service_get(entry
->protocol
,
2277 entry
->addr
, entry
->port
);
2279 ip_vs_copy_service(entry
, svc
);
2280 if (copy_to_user(user
, entry
, sizeof(*entry
)) != 0)
2282 ip_vs_service_put(svc
);
2288 case IP_VS_SO_GET_DESTS
:
2290 struct ip_vs_get_dests
*get
;
2293 get
= (struct ip_vs_get_dests
*)arg
;
/* expected total size: fixed header plus num_dests entries */
2294 size
= sizeof(*get
) +
2295 sizeof(struct ip_vs_dest_entry
) * get
->num_dests
;
2297 IP_VS_ERR("length: %u != %u\n", *len
, size
);
2301 ret
= __ip_vs_get_dest_entries(get
, user
);
2305 case IP_VS_SO_GET_TIMEOUT
:
2307 struct ip_vs_timeout_user t
;
2309 __ip_vs_get_timeouts(&t
);
2310 if (copy_to_user(user
, &t
, sizeof(t
)) != 0)
2315 case IP_VS_SO_GET_DAEMON
:
2317 struct ip_vs_daemon_user d
[2];
/* d[0] describes the master sync daemon, d[1] the backup; unused slots stay zeroed */
2319 memset(&d
, 0, sizeof(d
));
2320 if (ip_vs_sync_state
& IP_VS_STATE_MASTER
) {
2321 d
[0].state
= IP_VS_STATE_MASTER
;
2322 strlcpy(d
[0].mcast_ifn
, ip_vs_master_mcast_ifn
, sizeof(d
[0].mcast_ifn
));
2323 d
[0].syncid
= ip_vs_master_syncid
;
2325 if (ip_vs_sync_state
& IP_VS_STATE_BACKUP
) {
2326 d
[1].state
= IP_VS_STATE_BACKUP
;
2327 strlcpy(d
[1].mcast_ifn
, ip_vs_backup_mcast_ifn
, sizeof(d
[1].mcast_ifn
));
2328 d
[1].syncid
= ip_vs_backup_syncid
;
2330 if (copy_to_user(user
, &d
, sizeof(d
)) != 0)
2340 mutex_unlock(&__ip_vs_mutex
);
/*
 * Netfilter sockopt registration for IPVS: set commands in the range
 * [IP_VS_BASE_CTL, IP_VS_SO_SET_MAX] go to do_ip_vs_set_ctl(), get commands
 * in [IP_VS_BASE_CTL, IP_VS_SO_GET_MAX] go to do_ip_vs_get_ctl() (the
 * *_optmax values are one past the last command).  Registered by
 * ip_vs_control_init().
 * NOTE(review): the member preceding .set_optmin is not visible in this
 * listing -- confirm against the complete source.
 */
2345 static struct nf_sockopt_ops ip_vs_sockopts
= {
2347 .set_optmin
= IP_VS_BASE_CTL
,
2348 .set_optmax
= IP_VS_SO_SET_MAX
+1,
2349 .set
= do_ip_vs_set_ctl
,
2350 .get_optmin
= IP_VS_BASE_CTL
,
2351 .get_optmax
= IP_VS_SO_GET_MAX
+1,
2352 .get
= do_ip_vs_get_ctl
,
2353 .owner
= THIS_MODULE
,
/*
 * Set up the IPVS control plane.
 *
 * Visible steps, in order: register the sockopt interface, create the
 * "ip_vs" and "ip_vs_stats" proc entries in init_net, register the sysctl
 * tree rooted at vs_root_table, initialise the list heads of the service,
 * fwmark-service and real-server hash tables, zero ip_vs_stats and set up
 * its lock and estimator, and schedule the first run of defense_work.
 * NOTE(review): the local declarations, the handling of a failed sockopt
 * registration and the return statements are not visible in this listing --
 * confirm against the complete source.
 */
2357 int ip_vs_control_init(void)
2364 ret
= nf_register_sockopt(&ip_vs_sockopts
);
2366 IP_VS_ERR("cannot register sockopt.\n");
2370 proc_net_fops_create(&init_net
, "ip_vs", 0, &ip_vs_info_fops
);
2371 proc_net_fops_create(&init_net
, "ip_vs_stats",0, &ip_vs_stats_fops
);
2373 sysctl_header
= register_sysctl_table(vs_root_table
);
2375 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2376 for(idx
= 0; idx
< IP_VS_SVC_TAB_SIZE
; idx
++) {
2377 INIT_LIST_HEAD(&ip_vs_svc_table
[idx
]);
2378 INIT_LIST_HEAD(&ip_vs_svc_fwm_table
[idx
]);
2380 for(idx
= 0; idx
< IP_VS_RTAB_SIZE
; idx
++) {
2381 INIT_LIST_HEAD(&ip_vs_rtable
[idx
]);
2384 memset(&ip_vs_stats
, 0, sizeof(ip_vs_stats
));
2385 spin_lock_init(&ip_vs_stats
.lock
);
2386 ip_vs_new_estimator(&ip_vs_stats
);
2388 /* Hook the defense timer */
2389 schedule_delayed_work(&defense_work
, DEFENSE_TIMER_PERIOD
);
2396 void ip_vs_control_cleanup(void)
2399 ip_vs_trash_cleanup();
2400 cancel_rearming_delayed_work(&defense_work
);
2401 cancel_work_sync(&defense_work
.work
);
2402 ip_vs_kill_estimator(&ip_vs_stats
);
2403 unregister_sysctl_table(sysctl_header
);
2404 proc_net_remove(&init_net
, "ip_vs_stats");
2405 proc_net_remove(&init_net
, "ip_vs");
2406 nf_unregister_sockopt(&ip_vs_sockopts
);