/*
 * This is a module which is used for queueing IPv4 packets and
 * communicating with userspace via netlink.
 *
 * (C) 2000 James Morris, this code is GPL.
 *
 * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). (JM)
 * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). (JM)
 * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
 *             Zander). (JM)
 */
13 #include <linux/module.h>
14 #include <linux/skbuff.h>
15 #include <linux/init.h>
17 #include <linux/notifier.h>
18 #include <linux/netdevice.h>
19 #include <linux/netfilter.h>
20 #include <linux/netlink.h>
21 #include <linux/spinlock.h>
22 #include <linux/rtnetlink.h>
23 #include <linux/sysctl.h>
24 #include <linux/proc_fs.h>
27 #include <linux/netfilter_ipv4/ip_queue.h>
/* Default cap on queued packets; runtime-tunable via the sysctl below. */
#define IPQ_QMAX_DEFAULT 1024
/* /proc/net entry name for the debug/status read-out. */
#define IPQ_PROC_FS_NAME "ip_queue"
/* Sysctl binary id and name for the queue-length limit. */
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
34 typedef struct ipq_queue_element
{
35 struct list_head list
; /* Links element into queue */
36 int verdict
; /* Current verdict */
37 struct nf_info
*info
; /* Extra info from netfilter */
38 struct sk_buff
*skb
; /* Packet inside */
39 } ipq_queue_element_t
;
41 typedef int (*ipq_send_cb_t
)(ipq_queue_element_t
*e
);
43 typedef struct ipq_peer
{
44 pid_t pid
; /* PID of userland peer */
45 unsigned char died
; /* We think the peer died */
46 unsigned char copy_mode
; /* Copy packet as well as metadata? */
47 size_t copy_range
; /* Range past metadata to copy */
48 ipq_send_cb_t send
; /* Callback for sending data to peer */
51 typedef struct ipq_queue
{
52 int len
; /* Current queue len */
53 int *maxlen
; /* Maximum queue len, via sysctl */
54 unsigned char flushing
; /* If queue is being flushed */
55 unsigned char terminate
; /* If the queue is being terminated */
56 struct list_head list
; /* Head of packet queue */
57 spinlock_t lock
; /* Queue spinlock */
58 ipq_peer_t peer
; /* Userland peer */
61 /****************************************************************************
65 ****************************************************************************/
67 /* Dequeue a packet if matched by cmp, or the next available if cmp is NULL */
68 static ipq_queue_element_t
*
69 ipq_dequeue(ipq_queue_t
*q
,
70 int (*cmp
)(ipq_queue_element_t
*, unsigned long),
75 spin_lock_bh(&q
->lock
);
76 for (i
= q
->list
.prev
; i
!= &q
->list
; i
= i
->prev
) {
77 ipq_queue_element_t
*e
= (ipq_queue_element_t
*)i
;
79 if (!cmp
|| cmp(e
, data
)) {
82 spin_unlock_bh(&q
->lock
);
86 spin_unlock_bh(&q
->lock
);
90 /* Flush all packets */
91 static void ipq_flush(ipq_queue_t
*q
)
93 ipq_queue_element_t
*e
;
95 spin_lock_bh(&q
->lock
);
97 spin_unlock_bh(&q
->lock
);
98 while ((e
= ipq_dequeue(q
, NULL
, 0))) {
100 nf_reinject(e
->skb
, e
->info
, e
->verdict
);
103 spin_lock_bh(&q
->lock
);
105 spin_unlock_bh(&q
->lock
);
108 static ipq_queue_t
*ipq_create_queue(nf_queue_outfn_t outfn
,
109 ipq_send_cb_t send_cb
,
110 int *errp
, int *sysctl_qmax
)
116 q
= kmalloc(sizeof(ipq_queue_t
), GFP_KERNEL
);
123 q
->peer
.copy_mode
= IPQ_COPY_NONE
;
124 q
->peer
.copy_range
= 0;
125 q
->peer
.send
= send_cb
;
127 q
->maxlen
= sysctl_qmax
;
130 INIT_LIST_HEAD(&q
->list
);
131 spin_lock_init(&q
->lock
);
132 status
= nf_register_queue_handler(PF_INET
, outfn
, q
);
141 static int ipq_enqueue(ipq_queue_t
*q
,
142 struct sk_buff
*skb
, struct nf_info
*info
)
144 ipq_queue_element_t
*e
;
147 e
= kmalloc(sizeof(*e
), GFP_ATOMIC
);
149 printk(KERN_ERR
"ip_queue: OOM in enqueue\n");
152 e
->verdict
= NF_DROP
;
155 spin_lock_bh(&q
->lock
);
156 if (q
->len
>= *q
->maxlen
) {
157 spin_unlock_bh(&q
->lock
);
159 printk(KERN_WARNING
"ip_queue: full at %d entries, "
160 "dropping packet(s).\n", q
->len
);
163 if (q
->flushing
|| q
->peer
.copy_mode
== IPQ_COPY_NONE
164 || q
->peer
.pid
== 0 || q
->peer
.died
|| q
->terminate
) {
165 spin_unlock_bh(&q
->lock
);
168 status
= q
->peer
.send(e
);
170 list_add(&e
->list
, &q
->list
);
172 spin_unlock_bh(&q
->lock
);
175 spin_unlock_bh(&q
->lock
);
176 if (status
== -ECONNREFUSED
) {
177 printk(KERN_INFO
"ip_queue: peer %d died, "
178 "resetting state and flushing queue\n", q
->peer
.pid
);
181 q
->peer
.copy_mode
= IPQ_COPY_NONE
;
182 q
->peer
.copy_range
= 0;
190 static void ipq_destroy_queue(ipq_queue_t
*q
)
192 nf_unregister_queue_handler(PF_INET
);
193 spin_lock_bh(&q
->lock
);
195 spin_unlock_bh(&q
->lock
);
200 static int ipq_mangle_ipv4(ipq_verdict_msg_t
*v
, ipq_queue_element_t
*e
)
203 struct iphdr
*user_iph
= (struct iphdr
*)v
->payload
;
205 if (v
->data_len
< sizeof(*user_iph
))
207 diff
= v
->data_len
- e
->skb
->len
;
209 skb_trim(e
->skb
, v
->data_len
);
211 if (v
->data_len
> 0xFFFF)
213 if (diff
> skb_tailroom(e
->skb
)) {
214 struct sk_buff
*newskb
;
216 newskb
= skb_copy_expand(e
->skb
,
217 skb_headroom(e
->skb
),
220 if (newskb
== NULL
) {
221 printk(KERN_WARNING
"ip_queue: OOM "
222 "in mangle, dropping packet\n");
228 skb_put(e
->skb
, diff
);
230 memcpy(e
->skb
->data
, v
->payload
, v
->data_len
);
231 e
->skb
->nfcache
|= NFC_ALTERED
;
235 static inline int id_cmp(ipq_queue_element_t
*e
, unsigned long id
)
237 return (id
== (unsigned long )e
);
240 static int ipq_set_verdict(ipq_queue_t
*q
,
241 ipq_verdict_msg_t
*v
, unsigned int len
)
243 ipq_queue_element_t
*e
;
245 if (v
->value
> NF_MAX_VERDICT
)
247 e
= ipq_dequeue(q
, id_cmp
, v
->id
);
251 e
->verdict
= v
->value
;
252 if (v
->data_len
&& v
->data_len
== len
)
253 if (ipq_mangle_ipv4(v
, e
) < 0)
254 e
->verdict
= NF_DROP
;
255 nf_reinject(e
->skb
, e
->info
, e
->verdict
);
261 static int ipq_receive_peer(ipq_queue_t
*q
, ipq_peer_msg_t
*m
,
262 unsigned char type
, unsigned int len
)
267 spin_lock_bh(&q
->lock
);
268 if (q
->terminate
|| q
->flushing
)
270 spin_unlock_bh(&q
->lock
);
271 if (len
< sizeof(ipq_peer_msg_t
))
275 switch (m
->msg
.mode
.value
) {
277 q
->peer
.copy_mode
= IPQ_COPY_META
;
278 q
->peer
.copy_range
= 0;
280 case IPQ_COPY_PACKET
:
281 q
->peer
.copy_mode
= IPQ_COPY_PACKET
;
282 q
->peer
.copy_range
= m
->msg
.mode
.range
;
283 if (q
->peer
.copy_range
> 0xFFFF)
284 q
->peer
.copy_range
= 0xFFFF;
291 if (m
->msg
.verdict
.value
> NF_MAX_VERDICT
)
294 status
= ipq_set_verdict(q
,
304 static inline int dev_cmp(ipq_queue_element_t
*e
, unsigned long ifindex
)
307 if (e
->info
->indev
->ifindex
== ifindex
)
310 if (e
->info
->outdev
->ifindex
== ifindex
)
315 /* Drop any queued packets associated with device ifindex */
316 static void ipq_dev_drop(ipq_queue_t
*q
, int ifindex
)
318 ipq_queue_element_t
*e
;
320 while ((e
= ipq_dequeue(q
, dev_cmp
, ifindex
))) {
321 e
->verdict
= NF_DROP
;
322 nf_reinject(e
->skb
, e
->info
, e
->verdict
);
327 /****************************************************************************
329 * Netfilter interface
331 ****************************************************************************/
334 * Packets arrive here from netfilter for queuing to userspace.
335 * All of them must be fed back via nf_reinject() or Alexey will kill Rusty.
337 static int netfilter_receive(struct sk_buff
*skb
,
338 struct nf_info
*info
, void *data
)
340 return ipq_enqueue((ipq_queue_t
*)data
, skb
, info
);
343 /****************************************************************************
347 ****************************************************************************/
349 static struct sock
*nfnl
= NULL
;
350 ipq_queue_t
*nlq
= NULL
;
352 static struct sk_buff
*netlink_build_message(ipq_queue_element_t
*e
, int *errp
)
354 unsigned char *old_tail
;
358 ipq_packet_msg_t
*pm
;
359 struct nlmsghdr
*nlh
;
361 switch (nlq
->peer
.copy_mode
) {
365 size
= NLMSG_SPACE(sizeof(*pm
));
368 case IPQ_COPY_PACKET
:
369 copy_range
= nlq
->peer
.copy_range
;
370 if (copy_range
== 0 || copy_range
> e
->skb
->len
)
371 data_len
= e
->skb
->len
;
373 data_len
= copy_range
;
374 size
= NLMSG_SPACE(sizeof(*pm
) + data_len
);
382 skb
= alloc_skb(size
, GFP_ATOMIC
);
385 old_tail
= skb
->tail
;
386 nlh
= NLMSG_PUT(skb
, 0, 0, IPQM_PACKET
, size
- sizeof(*nlh
));
387 pm
= NLMSG_DATA(nlh
);
388 memset(pm
, 0, sizeof(*pm
));
389 pm
->packet_id
= (unsigned long )e
;
390 pm
->data_len
= data_len
;
391 pm
->timestamp_sec
= e
->skb
->stamp
.tv_sec
;
392 pm
->timestamp_usec
= e
->skb
->stamp
.tv_usec
;
393 pm
->mark
= e
->skb
->nfmark
;
394 pm
->hook
= e
->info
->hook
;
395 if (e
->info
->indev
) strcpy(pm
->indev_name
, e
->info
->indev
->name
);
396 else pm
->indev_name
[0] = '\0';
397 if (e
->info
->outdev
) strcpy(pm
->outdev_name
, e
->info
->outdev
->name
);
398 else pm
->outdev_name
[0] = '\0';
400 memcpy(pm
->payload
, e
->skb
->data
, data_len
);
401 nlh
->nlmsg_len
= skb
->tail
- old_tail
;
402 NETLINK_CB(skb
).dst_groups
= 0;
408 printk(KERN_ERR
"ip_queue: error creating netlink message\n");
412 static int netlink_send_peer(ipq_queue_element_t
*e
)
417 skb
= netlink_build_message(e
, &status
);
420 return netlink_unicast(nfnl
, skb
, nlq
->peer
.pid
, MSG_DONTWAIT
);
/* Ack the message with an error and bail out of the receive handler.
 * do { ... } while (0) with NO trailing semicolon, so the macro expands
 * as a single statement and is safe in an unbraced if/else. */
#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
425 extern __inline__
void netlink_receive_user_skb(struct sk_buff
*skb
)
428 struct nlmsghdr
*nlh
;
430 nlh
= (struct nlmsghdr
*)skb
->data
;
431 if (nlh
->nlmsg_len
< sizeof(*nlh
)
432 || skb
->len
< nlh
->nlmsg_len
433 || nlh
->nlmsg_pid
<= 0
434 || !(nlh
->nlmsg_flags
& NLM_F_REQUEST
)
435 || nlh
->nlmsg_flags
& NLM_F_MULTI
)
436 RCV_SKB_FAIL(-EINVAL
);
437 if (nlh
->nlmsg_flags
& MSG_TRUNC
)
438 RCV_SKB_FAIL(-ECOMM
);
439 type
= nlh
->nlmsg_type
;
440 if (type
< NLMSG_NOOP
|| type
>= IPQM_MAX
)
441 RCV_SKB_FAIL(-EINVAL
);
442 if (type
<= IPQM_BASE
)
444 if(!cap_raised(NETLINK_CB(skb
).eff_cap
, CAP_NET_ADMIN
))
445 RCV_SKB_FAIL(-EPERM
);
446 if (nlq
->peer
.pid
&& !nlq
->peer
.died
447 && (nlq
->peer
.pid
!= nlh
->nlmsg_pid
)) {
448 printk(KERN_WARNING
"ip_queue: peer pid changed from %d to "
449 "%d, flushing queue\n", nlq
->peer
.pid
, nlh
->nlmsg_pid
);
452 nlq
->peer
.pid
= nlh
->nlmsg_pid
;
454 status
= ipq_receive_peer(nlq
, NLMSG_DATA(nlh
),
455 type
, skb
->len
- NLMSG_LENGTH(0));
457 RCV_SKB_FAIL(status
);
458 if (nlh
->nlmsg_flags
& NLM_F_ACK
)
459 netlink_ack(skb
, nlh
, 0);
463 /* Note: we are only dealing with single part messages at the moment. */
464 static void netlink_receive_user_sk(struct sock
*sk
, int len
)
469 if (rtnl_shlock_nowait())
471 while ((skb
= skb_dequeue(&sk
->receive_queue
)) != NULL
) {
472 netlink_receive_user_skb(skb
);
476 } while (nfnl
&& nfnl
->receive_queue
.qlen
);
479 /****************************************************************************
483 ****************************************************************************/
485 static int receive_event(struct notifier_block
*this,
486 unsigned long event
, void *ptr
)
488 struct net_device
*dev
= ptr
;
490 /* Drop any packets associated with the downed device */
491 if (event
== NETDEV_DOWN
)
492 ipq_dev_drop(nlq
, dev
->ifindex
);
496 struct notifier_block ipq_dev_notifier
= {
502 /****************************************************************************
504 * Sysctl - queue tuning.
506 ****************************************************************************/
508 static int sysctl_maxlen
= IPQ_QMAX_DEFAULT
;
510 static struct ctl_table_header
*ipq_sysctl_header
;
512 static ctl_table ipq_table
[] = {
513 { NET_IPQ_QMAX
, NET_IPQ_QMAX_NAME
, &sysctl_maxlen
,
514 sizeof(sysctl_maxlen
), 0644, NULL
, proc_dointvec
},
518 static ctl_table ipq_dir_table
[] = {
519 {NET_IPV4
, "ipv4", NULL
, 0, 0555, ipq_table
, 0, 0, 0, 0, 0},
523 static ctl_table ipq_root_table
[] = {
524 {CTL_NET
, "net", NULL
, 0, 0555, ipq_dir_table
, 0, 0, 0, 0, 0},
528 /****************************************************************************
530 * Procfs - debugging info.
532 ****************************************************************************/
534 static int ipq_get_info(char *buffer
, char **start
, off_t offset
, int length
)
538 spin_lock_bh(&nlq
->lock
);
539 len
= sprintf(buffer
,
542 "Peer copy mode : %d\n"
543 "Peer copy range : %Zu\n"
544 "Queue length : %d\n"
545 "Queue max. length : %d\n"
546 "Queue flushing : %d\n"
547 "Queue terminate : %d\n",
551 nlq
->peer
.copy_range
,
556 spin_unlock_bh(&nlq
->lock
);
557 *start
= buffer
+ offset
;
566 /****************************************************************************
570 ****************************************************************************/
572 static int __init
init(void)
576 nfnl
= netlink_kernel_create(NETLINK_FIREWALL
, netlink_receive_user_sk
);
578 printk(KERN_ERR
"ip_queue: initialisation failed: unable to "
579 "create kernel netlink socket\n");
582 nlq
= ipq_create_queue(netfilter_receive
,
583 netlink_send_peer
, &status
, &sysctl_maxlen
);
585 printk(KERN_ERR
"ip_queue: initialisation failed: unable to "
587 sock_release(nfnl
->socket
);
590 register_netdevice_notifier(&ipq_dev_notifier
);
591 proc_net_create(IPQ_PROC_FS_NAME
, 0, ipq_get_info
);
592 ipq_sysctl_header
= register_sysctl_table(ipq_root_table
, 0);
596 static void __exit
fini(void)
598 unregister_sysctl_table(ipq_sysctl_header
);
599 proc_net_remove(IPQ_PROC_FS_NAME
);
600 unregister_netdevice_notifier(&ipq_dev_notifier
);
601 ipq_destroy_queue(nlq
);
602 sock_release(nfnl
->socket
);
605 MODULE_DESCRIPTION("IPv4 packet queue handler");