/*
 * This is a module which is used for queueing IPv4 packets and
 * communicating with userspace via netlink.
 *
 * (C) 2000 James Morris, this code is GPL.
 *
 * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
 * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
 * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
 *             Zander).
 * 2000-08-01: Added Nick Williams' MAC support.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/netlink.h>
#include <linux/spinlock.h>
#include <linux/rtnetlink.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/netfilter_ipv4/ip_queue.h>
/* Queue and sysctl tuning constants. */
#define IPQ_QMAX_DEFAULT 1024			/* default queue length cap */
#define IPQ_PROC_FS_NAME "ip_queue"		/* /proc/net entry name */
#define NET_IPQ_QMAX 2088			/* sysctl binary id */
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"	/* sysctl file name */
35 typedef struct ipq_queue_element
{
36 struct list_head list
; /* Links element into queue */
37 int verdict
; /* Current verdict */
38 struct nf_info
*info
; /* Extra info from netfilter */
39 struct sk_buff
*skb
; /* Packet inside */
40 } ipq_queue_element_t
;
42 typedef int (*ipq_send_cb_t
)(ipq_queue_element_t
*e
);
44 typedef struct ipq_peer
{
45 pid_t pid
; /* PID of userland peer */
46 unsigned char died
; /* We think the peer died */
47 unsigned char copy_mode
; /* Copy packet as well as metadata? */
48 size_t copy_range
; /* Range past metadata to copy */
49 ipq_send_cb_t send
; /* Callback for sending data to peer */
52 typedef struct ipq_queue
{
53 int len
; /* Current queue len */
54 int *maxlen
; /* Maximum queue len, via sysctl */
55 unsigned char flushing
; /* If queue is being flushed */
56 unsigned char terminate
; /* If the queue is being terminated */
57 struct list_head list
; /* Head of packet queue */
58 spinlock_t lock
; /* Queue spinlock */
59 ipq_peer_t peer
; /* Userland peer */
62 /****************************************************************************
66 ****************************************************************************/
68 /* Dequeue a packet if matched by cmp, or the next available if cmp is NULL */
69 static ipq_queue_element_t
*
70 ipq_dequeue(ipq_queue_t
*q
,
71 int (*cmp
)(ipq_queue_element_t
*, unsigned long),
76 spin_lock_bh(&q
->lock
);
77 for (i
= q
->list
.prev
; i
!= &q
->list
; i
= i
->prev
) {
78 ipq_queue_element_t
*e
= (ipq_queue_element_t
*)i
;
80 if (!cmp
|| cmp(e
, data
)) {
83 spin_unlock_bh(&q
->lock
);
87 spin_unlock_bh(&q
->lock
);
91 /* Flush all packets */
92 static void ipq_flush(ipq_queue_t
*q
)
94 ipq_queue_element_t
*e
;
96 spin_lock_bh(&q
->lock
);
98 spin_unlock_bh(&q
->lock
);
99 while ((e
= ipq_dequeue(q
, NULL
, 0))) {
100 e
->verdict
= NF_DROP
;
101 nf_reinject(e
->skb
, e
->info
, e
->verdict
);
104 spin_lock_bh(&q
->lock
);
106 spin_unlock_bh(&q
->lock
);
109 static ipq_queue_t
*ipq_create_queue(nf_queue_outfn_t outfn
,
110 ipq_send_cb_t send_cb
,
111 int *errp
, int *sysctl_qmax
)
117 q
= kmalloc(sizeof(ipq_queue_t
), GFP_KERNEL
);
124 q
->peer
.copy_mode
= IPQ_COPY_NONE
;
125 q
->peer
.copy_range
= 0;
126 q
->peer
.send
= send_cb
;
128 q
->maxlen
= sysctl_qmax
;
131 INIT_LIST_HEAD(&q
->list
);
132 spin_lock_init(&q
->lock
);
133 status
= nf_register_queue_handler(PF_INET
, outfn
, q
);
142 static int ipq_enqueue(ipq_queue_t
*q
,
143 struct sk_buff
*skb
, struct nf_info
*info
)
145 ipq_queue_element_t
*e
;
148 e
= kmalloc(sizeof(*e
), GFP_ATOMIC
);
150 printk(KERN_ERR
"ip_queue: OOM in enqueue\n");
153 e
->verdict
= NF_DROP
;
156 spin_lock_bh(&q
->lock
);
157 if (q
->len
>= *q
->maxlen
) {
158 spin_unlock_bh(&q
->lock
);
160 printk(KERN_WARNING
"ip_queue: full at %d entries, "
161 "dropping packet(s).\n", q
->len
);
164 if (q
->flushing
|| q
->peer
.copy_mode
== IPQ_COPY_NONE
165 || q
->peer
.pid
== 0 || q
->peer
.died
|| q
->terminate
) {
166 spin_unlock_bh(&q
->lock
);
169 status
= q
->peer
.send(e
);
171 list_add(&e
->list
, &q
->list
);
173 spin_unlock_bh(&q
->lock
);
176 spin_unlock_bh(&q
->lock
);
177 if (status
== -ECONNREFUSED
) {
178 printk(KERN_INFO
"ip_queue: peer %d died, "
179 "resetting state and flushing queue\n", q
->peer
.pid
);
182 q
->peer
.copy_mode
= IPQ_COPY_NONE
;
183 q
->peer
.copy_range
= 0;
191 static void ipq_destroy_queue(ipq_queue_t
*q
)
193 nf_unregister_queue_handler(PF_INET
);
194 spin_lock_bh(&q
->lock
);
196 spin_unlock_bh(&q
->lock
);
201 static int ipq_mangle_ipv4(ipq_verdict_msg_t
*v
, ipq_queue_element_t
*e
)
204 struct iphdr
*user_iph
= (struct iphdr
*)v
->payload
;
206 if (v
->data_len
< sizeof(*user_iph
))
208 diff
= v
->data_len
- e
->skb
->len
;
210 skb_trim(e
->skb
, v
->data_len
);
212 if (v
->data_len
> 0xFFFF)
214 if (diff
> skb_tailroom(e
->skb
)) {
215 struct sk_buff
*newskb
;
217 newskb
= skb_copy_expand(e
->skb
,
218 skb_headroom(e
->skb
),
221 if (newskb
== NULL
) {
222 printk(KERN_WARNING
"ip_queue: OOM "
223 "in mangle, dropping packet\n");
229 skb_put(e
->skb
, diff
);
231 memcpy(e
->skb
->data
, v
->payload
, v
->data_len
);
232 e
->skb
->nfcache
|= NFC_ALTERED
;
236 static inline int id_cmp(ipq_queue_element_t
*e
, unsigned long id
)
238 return (id
== (unsigned long )e
);
241 static int ipq_set_verdict(ipq_queue_t
*q
,
242 ipq_verdict_msg_t
*v
, unsigned int len
)
244 ipq_queue_element_t
*e
;
246 if (v
->value
> NF_MAX_VERDICT
)
248 e
= ipq_dequeue(q
, id_cmp
, v
->id
);
252 e
->verdict
= v
->value
;
253 if (v
->data_len
&& v
->data_len
== len
)
254 if (ipq_mangle_ipv4(v
, e
) < 0)
255 e
->verdict
= NF_DROP
;
256 nf_reinject(e
->skb
, e
->info
, e
->verdict
);
262 static int ipq_receive_peer(ipq_queue_t
*q
, ipq_peer_msg_t
*m
,
263 unsigned char type
, unsigned int len
)
269 spin_lock_bh(&q
->lock
);
270 busy
= (q
->terminate
|| q
->flushing
);
271 spin_unlock_bh(&q
->lock
);
274 if (len
< sizeof(ipq_peer_msg_t
))
278 switch (m
->msg
.mode
.value
) {
280 q
->peer
.copy_mode
= IPQ_COPY_META
;
281 q
->peer
.copy_range
= 0;
283 case IPQ_COPY_PACKET
:
284 q
->peer
.copy_mode
= IPQ_COPY_PACKET
;
285 q
->peer
.copy_range
= m
->msg
.mode
.range
;
286 if (q
->peer
.copy_range
> 0xFFFF)
287 q
->peer
.copy_range
= 0xFFFF;
294 if (m
->msg
.verdict
.value
> NF_MAX_VERDICT
)
297 status
= ipq_set_verdict(q
,
307 static inline int dev_cmp(ipq_queue_element_t
*e
, unsigned long ifindex
)
310 if (e
->info
->indev
->ifindex
== ifindex
)
313 if (e
->info
->outdev
->ifindex
== ifindex
)
318 /* Drop any queued packets associated with device ifindex */
319 static void ipq_dev_drop(ipq_queue_t
*q
, int ifindex
)
321 ipq_queue_element_t
*e
;
323 while ((e
= ipq_dequeue(q
, dev_cmp
, ifindex
))) {
324 e
->verdict
= NF_DROP
;
325 nf_reinject(e
->skb
, e
->info
, e
->verdict
);
330 /****************************************************************************
332 * Netfilter interface
334 ****************************************************************************/
337 * Packets arrive here from netfilter for queuing to userspace.
338 * All of them must be fed back via nf_reinject() or Alexey will kill Rusty.
340 static int netfilter_receive(struct sk_buff
*skb
,
341 struct nf_info
*info
, void *data
)
343 return ipq_enqueue((ipq_queue_t
*)data
, skb
, info
);
346 /****************************************************************************
350 ****************************************************************************/
352 static struct sock
*nfnl
= NULL
;
353 ipq_queue_t
*nlq
= NULL
;
355 static struct sk_buff
*netlink_build_message(ipq_queue_element_t
*e
, int *errp
)
357 unsigned char *old_tail
;
361 ipq_packet_msg_t
*pm
;
362 struct nlmsghdr
*nlh
;
364 switch (nlq
->peer
.copy_mode
) {
368 size
= NLMSG_SPACE(sizeof(*pm
));
371 case IPQ_COPY_PACKET
:
372 copy_range
= nlq
->peer
.copy_range
;
373 if (copy_range
== 0 || copy_range
> e
->skb
->len
)
374 data_len
= e
->skb
->len
;
376 data_len
= copy_range
;
377 size
= NLMSG_SPACE(sizeof(*pm
) + data_len
);
385 skb
= alloc_skb(size
, GFP_ATOMIC
);
388 old_tail
= skb
->tail
;
389 nlh
= NLMSG_PUT(skb
, 0, 0, IPQM_PACKET
, size
- sizeof(*nlh
));
390 pm
= NLMSG_DATA(nlh
);
391 memset(pm
, 0, sizeof(*pm
));
392 pm
->packet_id
= (unsigned long )e
;
393 pm
->data_len
= data_len
;
394 pm
->timestamp_sec
= e
->skb
->stamp
.tv_sec
;
395 pm
->timestamp_usec
= e
->skb
->stamp
.tv_usec
;
396 pm
->mark
= e
->skb
->nfmark
;
397 pm
->hook
= e
->info
->hook
;
398 if (e
->info
->indev
) strcpy(pm
->indev_name
, e
->info
->indev
->name
);
399 else pm
->indev_name
[0] = '\0';
400 if (e
->info
->outdev
) strcpy(pm
->outdev_name
, e
->info
->outdev
->name
);
401 else pm
->outdev_name
[0] = '\0';
402 pm
->hw_protocol
= e
->skb
->protocol
;
404 memcpy(pm
->payload
, e
->skb
->data
, data_len
);
405 nlh
->nlmsg_len
= skb
->tail
- old_tail
;
406 NETLINK_CB(skb
).dst_groups
= 0;
412 printk(KERN_ERR
"ip_queue: error creating netlink message\n");
416 static int netlink_send_peer(ipq_queue_element_t
*e
)
421 skb
= netlink_build_message(e
, &status
);
424 return netlink_unicast(nfnl
, skb
, nlq
->peer
.pid
, MSG_DONTWAIT
);
/*
 * Ack the message with an error and bail out of the receive handler.
 * No trailing semicolon after while (0): the caller supplies it, so the
 * macro behaves as a single statement and is safe inside if/else chains.
 */
#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
429 extern __inline__
void netlink_receive_user_skb(struct sk_buff
*skb
)
432 struct nlmsghdr
*nlh
;
434 nlh
= (struct nlmsghdr
*)skb
->data
;
435 if (nlh
->nlmsg_len
< sizeof(*nlh
)
436 || skb
->len
< nlh
->nlmsg_len
437 || nlh
->nlmsg_pid
<= 0
438 || !(nlh
->nlmsg_flags
& NLM_F_REQUEST
)
439 || nlh
->nlmsg_flags
& NLM_F_MULTI
)
440 RCV_SKB_FAIL(-EINVAL
);
441 if (nlh
->nlmsg_flags
& MSG_TRUNC
)
442 RCV_SKB_FAIL(-ECOMM
);
443 type
= nlh
->nlmsg_type
;
444 if (type
< NLMSG_NOOP
|| type
>= IPQM_MAX
)
445 RCV_SKB_FAIL(-EINVAL
);
446 if (type
<= IPQM_BASE
)
448 if(!cap_raised(NETLINK_CB(skb
).eff_cap
, CAP_NET_ADMIN
))
449 RCV_SKB_FAIL(-EPERM
);
450 if (nlq
->peer
.pid
&& !nlq
->peer
.died
451 && (nlq
->peer
.pid
!= nlh
->nlmsg_pid
)) {
452 printk(KERN_WARNING
"ip_queue: peer pid changed from %d to "
453 "%d, flushing queue\n", nlq
->peer
.pid
, nlh
->nlmsg_pid
);
456 nlq
->peer
.pid
= nlh
->nlmsg_pid
;
458 status
= ipq_receive_peer(nlq
, NLMSG_DATA(nlh
),
459 type
, skb
->len
- NLMSG_LENGTH(0));
461 RCV_SKB_FAIL(status
);
462 if (nlh
->nlmsg_flags
& NLM_F_ACK
)
463 netlink_ack(skb
, nlh
, 0);
467 /* Note: we are only dealing with single part messages at the moment. */
468 static void netlink_receive_user_sk(struct sock
*sk
, int len
)
473 if (rtnl_shlock_nowait())
475 while ((skb
= skb_dequeue(&sk
->receive_queue
)) != NULL
) {
476 netlink_receive_user_skb(skb
);
480 } while (nfnl
&& nfnl
->receive_queue
.qlen
);
483 /****************************************************************************
487 ****************************************************************************/
489 static int receive_event(struct notifier_block
*this,
490 unsigned long event
, void *ptr
)
492 struct net_device
*dev
= ptr
;
494 /* Drop any packets associated with the downed device */
495 if (event
== NETDEV_DOWN
)
496 ipq_dev_drop(nlq
, dev
->ifindex
);
500 struct notifier_block ipq_dev_notifier
= {
506 /****************************************************************************
508 * Sysctl - queue tuning.
510 ****************************************************************************/
512 static int sysctl_maxlen
= IPQ_QMAX_DEFAULT
;
514 static struct ctl_table_header
*ipq_sysctl_header
;
516 static ctl_table ipq_table
[] = {
517 { NET_IPQ_QMAX
, NET_IPQ_QMAX_NAME
, &sysctl_maxlen
,
518 sizeof(sysctl_maxlen
), 0644, NULL
, proc_dointvec
},
522 static ctl_table ipq_dir_table
[] = {
523 {NET_IPV4
, "ipv4", NULL
, 0, 0555, ipq_table
, 0, 0, 0, 0, 0},
527 static ctl_table ipq_root_table
[] = {
528 {CTL_NET
, "net", NULL
, 0, 0555, ipq_dir_table
, 0, 0, 0, 0, 0},
532 /****************************************************************************
534 * Procfs - debugging info.
536 ****************************************************************************/
538 static int ipq_get_info(char *buffer
, char **start
, off_t offset
, int length
)
542 spin_lock_bh(&nlq
->lock
);
543 len
= sprintf(buffer
,
546 "Peer copy mode : %d\n"
547 "Peer copy range : %Zu\n"
548 "Queue length : %d\n"
549 "Queue max. length : %d\n"
550 "Queue flushing : %d\n"
551 "Queue terminate : %d\n",
555 nlq
->peer
.copy_range
,
560 spin_unlock_bh(&nlq
->lock
);
561 *start
= buffer
+ offset
;
570 /****************************************************************************
574 ****************************************************************************/
576 static int __init
init(void)
580 nfnl
= netlink_kernel_create(NETLINK_FIREWALL
, netlink_receive_user_sk
);
582 printk(KERN_ERR
"ip_queue: initialisation failed: unable to "
583 "create kernel netlink socket\n");
586 nlq
= ipq_create_queue(netfilter_receive
,
587 netlink_send_peer
, &status
, &sysctl_maxlen
);
589 printk(KERN_ERR
"ip_queue: initialisation failed: unable to "
591 sock_release(nfnl
->socket
);
594 register_netdevice_notifier(&ipq_dev_notifier
);
595 proc_net_create(IPQ_PROC_FS_NAME
, 0, ipq_get_info
);
596 ipq_sysctl_header
= register_sysctl_table(ipq_root_table
, 0);
600 static void __exit
fini(void)
602 unregister_sysctl_table(ipq_sysctl_header
);
603 proc_net_remove(IPQ_PROC_FS_NAME
);
604 unregister_netdevice_notifier(&ipq_dev_notifier
);
605 ipq_destroy_queue(nlq
);
606 sock_release(nfnl
->socket
);
609 MODULE_DESCRIPTION("IPv4 packet queue handler");