/*
 * This is a module which is used for queueing IPv4 packets and
 * communicating with userspace via netlink.
 *
 * (C) 2000 James Morris, this code is GPL.
 *
 * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). (JM)
 * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). (JM)
 * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian
 *             Zander). (JM)
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/netlink.h>
#include <linux/spinlock.h>
#include <linux/rtnetlink.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <net/sock.h>

#include <linux/netfilter_ipv4/ip_queue.h>
#define IPQ_QMAX_DEFAULT 1024
#define IPQ_PROC_FS_NAME "ip_queue"
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
typedef struct ipq_queue_element {
	struct list_head list;	/* Links element into queue */
	int verdict;		/* Current verdict */
	struct nf_info *info;	/* Extra info from netfilter */
	struct sk_buff *skb;	/* Packet inside */
} ipq_queue_element_t;
typedef int (*ipq_send_cb_t)(ipq_queue_element_t *e);
typedef struct ipq_peer {
	pid_t pid;		/* PID of userland peer */
	unsigned char died;	/* We think the peer died */
	unsigned char copy_mode;	/* Copy packet as well as metadata? */
	size_t copy_range;	/* Range past metadata to copy */
	ipq_send_cb_t send;	/* Callback for sending data to peer */
} ipq_peer_t;
typedef struct ipq_queue {
	int len;		/* Current queue len */
	int *maxlen;		/* Maximum queue len, via sysctl */
	unsigned char flushing;	/* If queue is being flushed */
	unsigned char terminate;	/* If the queue is being terminated */
	struct list_head list;	/* Head of packet queue */
	spinlock_t lock;	/* Queue spinlock */
	ipq_peer_t peer;	/* Userland peer */
} ipq_queue_t;
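
/*
 * Locking: all queue state is protected by q->lock, taken with
 * spin_lock_bh() so packets can be enqueued from softirq context.
 * A queued packet is identified to userspace by the kernel address
 * of its queue element (see id_cmp() and netlink_build_message()
 * below), so IDs are unique for the lifetime of the element.
 */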
/****************************************************************************
 *
 * Packet queue
 *
 ****************************************************************************/
/* Dequeue a packet if matched by cmp, or the next available if cmp is NULL */
static ipq_queue_element_t *
ipq_dequeue(ipq_queue_t *q,
	    int (*cmp)(ipq_queue_element_t *, unsigned long),
	    unsigned long data)
{
	struct list_head *i;

	spin_lock_bh(&q->lock);
	for (i = q->list.prev; i != &q->list; i = i->prev) {
		ipq_queue_element_t *e = (ipq_queue_element_t *)i;

		if (!cmp || cmp(e, data)) {
			list_del(&e->list);
			q->len--;
			spin_unlock_bh(&q->lock);
			return e;
		}
	}
	spin_unlock_bh(&q->lock);
	return NULL;
}
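
/*
 * ipq_dequeue() walks from q->list.prev, i.e. oldest element first
 * (ipq_enqueue() adds at the head), and pops the first match.  For
 * example, ipq_set_verdict() below matches on the packet ID echoed
 * back by the peer:
 *
 *	e = ipq_dequeue(q, id_cmp, v->id);
 *
 * while ipq_dev_drop() matches on a device index via dev_cmp().
 */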
/* Flush all packets */
static void ipq_flush(ipq_queue_t *q)
{
	ipq_queue_element_t *e;

	spin_lock_bh(&q->lock);
	q->flushing = 1;
	spin_unlock_bh(&q->lock);
	while ((e = ipq_dequeue(q, NULL, 0))) {
		e->verdict = NF_DROP;
		nf_reinject(e->skb, e->info, e->verdict);
		kfree(e);
	}
	spin_lock_bh(&q->lock);
	q->flushing = 0;
	spin_unlock_bh(&q->lock);
}
static ipq_queue_t *ipq_create_queue(nf_queue_outfn_t outfn,
				     ipq_send_cb_t send_cb,
				     int *errp, int *sysctl_qmax)
{
	int status;
	ipq_queue_t *q;

	*errp = 0;
	q = kmalloc(sizeof(ipq_queue_t), GFP_KERNEL);
	if (q == NULL) {
		*errp = -ENOMEM;
		return NULL;
	}
	q->peer.pid = 0;
	q->peer.died = 0;
	q->peer.copy_mode = IPQ_COPY_NONE;
	q->peer.copy_range = 0;
	q->peer.send = send_cb;
	q->len = 0;
	q->maxlen = sysctl_qmax;
	q->flushing = 0;
	q->terminate = 0;
	INIT_LIST_HEAD(&q->list);
	spin_lock_init(&q->lock);
	status = nf_register_queue_handler(PF_INET, outfn, q);
	if (status < 0) {
		*errp = -EBUSY;
		kfree(q);
		return NULL;
	}
	return q;
}
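
/*
 * Note: netfilter accepts only one queue handler per protocol family,
 * so ipq_create_queue() fails with -EBUSY if another PF_INET queue
 * handler is already registered.
 */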
static int ipq_enqueue(ipq_queue_t *q,
		       struct sk_buff *skb, struct nf_info *info)
{
	ipq_queue_element_t *e;
	int status;

	e = kmalloc(sizeof(*e), GFP_ATOMIC);
	if (e == NULL) {
		printk(KERN_ERR "ip_queue: OOM in enqueue\n");
		return -ENOMEM;
	}
	e->verdict = NF_DROP;
	e->info = info;
	e->skb = skb;
	spin_lock_bh(&q->lock);
	if (q->len >= *q->maxlen) {
		spin_unlock_bh(&q->lock);
		if (net_ratelimit())
			printk(KERN_WARNING "ip_queue: full at %d entries, "
			       "dropping packet(s).\n", q->len);
		goto free_drop;
	}
	if (q->flushing || q->peer.copy_mode == IPQ_COPY_NONE
	    || q->peer.pid == 0 || q->peer.died || q->terminate) {
		spin_unlock_bh(&q->lock);
		goto free_drop;
	}
	status = q->peer.send(e);
	if (status > 0) {
		list_add(&e->list, &q->list);
		q->len++;
		spin_unlock_bh(&q->lock);
		return status;
	}
	spin_unlock_bh(&q->lock);
	if (status == -ECONNREFUSED) {
		printk(KERN_INFO "ip_queue: peer %d died, "
		       "resetting state and flushing queue\n", q->peer.pid);
		q->peer.died = 1;
		q->peer.pid = 0;
		q->peer.copy_mode = IPQ_COPY_NONE;
		q->peer.copy_range = 0;
		ipq_flush(q);
	}

free_drop:
	kfree(e);
	return -EBUSY;
}
static void ipq_destroy_queue(ipq_queue_t *q)
{
	nf_unregister_queue_handler(PF_INET);
	spin_lock_bh(&q->lock);
	q->terminate = 1;
	spin_unlock_bh(&q->lock);
	ipq_flush(q);
	kfree(q);
}
static int ipq_mangle_ipv4(ipq_verdict_msg_t *v, ipq_queue_element_t *e)
{
	int diff;
	struct iphdr *user_iph = (struct iphdr *)v->payload;

	if (v->data_len < sizeof(*user_iph))
		return 0;
	diff = v->data_len - e->skb->len;
	if (diff < 0)
		skb_trim(e->skb, v->data_len);
	else if (diff > 0) {
		if (v->data_len > 0xFFFF)
			return -EINVAL;
		if (diff > skb_tailroom(e->skb)) {
			struct sk_buff *newskb;

			newskb = skb_copy_expand(e->skb,
						 skb_headroom(e->skb),
						 diff,
						 GFP_ATOMIC);
			if (newskb == NULL) {
				printk(KERN_WARNING "ip_queue: OOM "
				       "in mangle, dropping packet\n");
				return -ENOMEM;
			}
			kfree_skb(e->skb);
			e->skb = newskb;
		}
		skb_put(e->skb, diff);
	}
	memcpy(e->skb->data, v->payload, v->data_len);
	e->skb->nfcache |= NFC_ALTERED;
	return 0;
}
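
/*
 * Worked example for the size arithmetic above: if the peer returns a
 * 1500-byte replacement payload for a 1460-byte packet, diff is +40;
 * the skb is reallocated via skb_copy_expand() only when those 40
 * bytes exceed skb_tailroom(), then skb_put() extends the data area
 * before the whole payload is copied in.  A shorter replacement
 * simply trims the skb.
 */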
static inline int id_cmp(ipq_queue_element_t *e, unsigned long id)
{
	return (id == (unsigned long)e);
}
static int ipq_set_verdict(ipq_queue_t *q,
			   ipq_verdict_msg_t *v, unsigned int len)
{
	ipq_queue_element_t *e;

	if (v->value > NF_MAX_VERDICT)
		return -EINVAL;
	e = ipq_dequeue(q, id_cmp, v->id);
	if (e == NULL)
		return -ENOENT;
	else {
		e->verdict = v->value;
		if (v->data_len && v->data_len == len)
			if (ipq_mangle_ipv4(v, e) < 0)
				e->verdict = NF_DROP;
		nf_reinject(e->skb, e->info, e->verdict);
		kfree(e);
		return 0;
	}
}
static int ipq_receive_peer(ipq_queue_t *q, ipq_peer_msg_t *m,
			    unsigned char type, unsigned int len)
{
	int status = 0;

	spin_lock_bh(&q->lock);
	if (q->terminate || q->flushing) {
		spin_unlock_bh(&q->lock);
		return -EBUSY;
	}
	spin_unlock_bh(&q->lock);
	if (len < sizeof(ipq_peer_msg_t))
		return -EINVAL;
	switch (type) {
	case IPQM_MODE:
		switch (m->msg.mode.value) {
		case IPQ_COPY_META:
			q->peer.copy_mode = IPQ_COPY_META;
			q->peer.copy_range = 0;
			break;
		case IPQ_COPY_PACKET:
			q->peer.copy_mode = IPQ_COPY_PACKET;
			q->peer.copy_range = m->msg.mode.range;
			if (q->peer.copy_range > 0xFFFF)
				q->peer.copy_range = 0xFFFF;
			break;
		default:
			status = -EINVAL;
		}
		break;
	case IPQM_VERDICT:
		if (m->msg.verdict.value > NF_MAX_VERDICT)
			status = -EINVAL;
		else
			status = ipq_set_verdict(q,
						 &m->msg.verdict,
						 len - sizeof(*m));
		break;
	default:
		status = -EINVAL;
	}
	return status;
}
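
/*
 * A userspace peer replies with IPQM_MODE and IPQM_VERDICT messages.
 * A minimal sketch of issuing a verdict (userspace code; assumes `fd'
 * is the peer's NETLINK_FIREWALL socket from the sketch further below,
 * and `nlh' points at a received IPQM_PACKET message):
 *
 *	ipq_packet_msg_t *pm = NLMSG_DATA(nlh);
 *	struct {
 *		struct nlmsghdr nlh;
 *		ipq_peer_msg_t  pm;
 *	} req;
 *
 *	memset(&req, 0, sizeof(req));
 *	req.nlh.nlmsg_len   = sizeof(req);
 *	req.nlh.nlmsg_type  = IPQM_VERDICT;
 *	req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *	req.nlh.nlmsg_pid   = getpid();
 *	req.pm.msg.verdict.value = NF_ACCEPT;
 *	req.pm.msg.verdict.id    = pm->packet_id;
 *	sendto(fd, &req, sizeof(req), 0,
 *	       (struct sockaddr *)&sa, sizeof(sa));  // sa as below
 */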
static inline int dev_cmp(ipq_queue_element_t *e, unsigned long ifindex)
{
	if (e->info->indev)
		if (e->info->indev->ifindex == ifindex)
			return 1;
	if (e->info->outdev)
		if (e->info->outdev->ifindex == ifindex)
			return 1;
	return 0;
}
/* Drop any queued packets associated with device ifindex */
static void ipq_dev_drop(ipq_queue_t *q, int ifindex)
{
	ipq_queue_element_t *e;

	while ((e = ipq_dequeue(q, dev_cmp, ifindex))) {
		e->verdict = NF_DROP;
		nf_reinject(e->skb, e->info, e->verdict);
		kfree(e);
	}
}
/****************************************************************************
 *
 * Netfilter interface
 *
 ****************************************************************************/

/*
 * Packets arrive here from netfilter for queuing to userspace.
 * All of them must be fed back via nf_reinject() or Alexey will kill Rusty.
 */
static int netfilter_receive(struct sk_buff *skb,
			     struct nf_info *info, void *data)
{
	return ipq_enqueue((ipq_queue_t *)data, skb, info);
}
/****************************************************************************
 *
 * Netlink interface.
 *
 ****************************************************************************/

static struct sock *nfnl = NULL;
static ipq_queue_t *nlq = NULL;
static struct sk_buff *netlink_build_message(ipq_queue_element_t *e, int *errp)
{
	unsigned char *old_tail;
	size_t size = 0;
	size_t data_len = 0;
	struct sk_buff *skb;
	ipq_packet_msg_t *pm;
	struct nlmsghdr *nlh;

	switch (nlq->peer.copy_mode) {
		size_t copy_range;

	case IPQ_COPY_META:
		size = NLMSG_SPACE(sizeof(*pm));
		data_len = 0;
		break;
	case IPQ_COPY_PACKET:
		copy_range = nlq->peer.copy_range;
		if (copy_range == 0 || copy_range > e->skb->len)
			data_len = e->skb->len;
		else
			data_len = copy_range;
		size = NLMSG_SPACE(sizeof(*pm) + data_len);
		break;
	case IPQ_COPY_NONE:
	default:
		*errp = -EINVAL;
		return NULL;
	}
	skb = alloc_skb(size, GFP_ATOMIC);
	if (!skb)
		goto nlmsg_failure;
	old_tail = skb->tail;
	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
	pm = NLMSG_DATA(nlh);
	memset(pm, 0, sizeof(*pm));
	pm->packet_id = (unsigned long)e;
	pm->data_len = data_len;
	pm->timestamp_sec = e->skb->stamp.tv_sec;
	pm->timestamp_usec = e->skb->stamp.tv_usec;
	pm->mark = e->skb->nfmark;
	pm->hook = e->info->hook;
	if (e->info->indev)
		strcpy(pm->indev_name, e->info->indev->name);
	else
		pm->indev_name[0] = '\0';
	if (e->info->outdev)
		strcpy(pm->outdev_name, e->info->outdev->name);
	else
		pm->outdev_name[0] = '\0';
	if (data_len)
		memcpy(pm->payload, e->skb->data, data_len);
	nlh->nlmsg_len = skb->tail - old_tail;
	NETLINK_CB(skb).dst_groups = 0;
	return skb;

nlmsg_failure:
	if (skb)
		kfree_skb(skb);
	*errp = 0;
	printk(KERN_ERR "ip_queue: error creating netlink message\n");
	return NULL;
}
static int netlink_send_peer(ipq_queue_element_t *e)
{
	int status = 0;
	struct sk_buff *skb;

	skb = netlink_build_message(e, &status);
	if (skb == NULL)
		return status;
	return netlink_unicast(nfnl, skb, nlq->peer.pid, MSG_DONTWAIT);
}
#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
static inline void netlink_receive_user_skb(struct sk_buff *skb)
{
	int status, type;
	struct nlmsghdr *nlh;

	nlh = (struct nlmsghdr *)skb->data;
	if (nlh->nlmsg_len < sizeof(*nlh)
	    || skb->len < nlh->nlmsg_len
	    || nlh->nlmsg_pid <= 0
	    || !(nlh->nlmsg_flags & NLM_F_REQUEST)
	    || nlh->nlmsg_flags & NLM_F_MULTI)
		RCV_SKB_FAIL(-EINVAL);
	if (nlh->nlmsg_flags & MSG_TRUNC)
		RCV_SKB_FAIL(-ECOMM);
	type = nlh->nlmsg_type;
	if (type < NLMSG_NOOP || type >= IPQM_MAX)
		RCV_SKB_FAIL(-EINVAL);
	if (type <= IPQM_BASE)
		return;
	if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
		RCV_SKB_FAIL(-EPERM);
	if (nlq->peer.pid && !nlq->peer.died
	    && (nlq->peer.pid != nlh->nlmsg_pid)) {
		printk(KERN_WARNING "ip_queue: peer pid changed from %d to "
		       "%d, flushing queue\n", nlq->peer.pid, nlh->nlmsg_pid);
		ipq_flush(nlq);
	}
	nlq->peer.pid = nlh->nlmsg_pid;
	nlq->peer.died = 0;
	status = ipq_receive_peer(nlq, NLMSG_DATA(nlh),
				  type, skb->len - NLMSG_LENGTH(0));
	if (status < 0)
		RCV_SKB_FAIL(status);
	if (nlh->nlmsg_flags & NLM_F_ACK)
		netlink_ack(skb, nlh, 0);
	return;
}
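
/*
 * For reference, the other half of the protocol: a peer binds a
 * NETLINK_FIREWALL socket and announces its copy mode before packets
 * are queued to it.  A minimal sketch (userspace; error handling
 * omitted):
 *
 *	int fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_FIREWALL);
 *	struct sockaddr_nl local, sa;
 *	struct {
 *		struct nlmsghdr nlh;
 *		ipq_peer_msg_t  pm;
 *	} req;
 *
 *	memset(&local, 0, sizeof(local));
 *	local.nl_family = AF_NETLINK;
 *	local.nl_pid = getpid();	// must match nlmsg_pid below
 *	bind(fd, (struct sockaddr *)&local, sizeof(local));
 *
 *	memset(&sa, 0, sizeof(sa));
 *	sa.nl_family = AF_NETLINK;	// nl_pid 0 = the kernel
 *	memset(&req, 0, sizeof(req));
 *	req.nlh.nlmsg_len   = sizeof(req);
 *	req.nlh.nlmsg_type  = IPQM_MODE;
 *	req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *	req.nlh.nlmsg_pid   = getpid();
 *	req.pm.msg.mode.value = IPQ_COPY_PACKET;
 *	req.pm.msg.mode.range = 0;	// 0 = copy whole packet
 *	sendto(fd, &req, sizeof(req), 0,
 *	       (struct sockaddr *)&sa, sizeof(sa));
 */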
/* Note: we are only dealing with single part messages at the moment. */
static void netlink_receive_user_sk(struct sock *sk, int len)
{
	do {
		struct sk_buff *skb;

		if (rtnl_shlock_nowait())
			return;
		while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
			netlink_receive_user_skb(skb);
			kfree_skb(skb);
		}
		up(&rtnl_sem);
	} while (nfnl && nfnl->receive_queue.qlen);
}
/****************************************************************************
 *
 * System events
 *
 ****************************************************************************/

static int receive_event(struct notifier_block *this,
			 unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	/* Drop any packets associated with the downed device */
	if (event == NETDEV_DOWN)
		ipq_dev_drop(nlq, dev->ifindex);
	return NOTIFY_DONE;
}

struct notifier_block ipq_dev_notifier = {
	receive_event,
	NULL,
	0
};
/****************************************************************************
 *
 * Sysctl - queue tuning.
 *
 ****************************************************************************/

static int sysctl_maxlen = IPQ_QMAX_DEFAULT;

static struct ctl_table_header *ipq_sysctl_header;

static ctl_table ipq_table[] = {
	{ NET_IPQ_QMAX, NET_IPQ_QMAX_NAME, &sysctl_maxlen,
	  sizeof(sysctl_maxlen), 0644, NULL, proc_dointvec },
	{ 0 }
};

static ctl_table ipq_dir_table[] = {
	{ NET_IPV4, "ipv4", NULL, 0, 0555, ipq_table, 0, 0, 0, 0, 0 },
	{ 0 }
};

static ctl_table ipq_root_table[] = {
	{ CTL_NET, "net", NULL, 0, 0555, ipq_dir_table, 0, 0, 0, 0, 0 },
	{ 0 }
};
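
/*
 * The tables above expose the queue limit as
 * /proc/sys/net/ipv4/ip_queue_maxlen; for example, from a root shell:
 *
 *	echo 4096 > /proc/sys/net/ipv4/ip_queue_maxlen
 */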
/****************************************************************************
 *
 * Procfs - debugging info.
 *
 ****************************************************************************/

static int ipq_get_info(char *buffer, char **start, off_t offset, int length)
{
	int len;

	spin_lock_bh(&nlq->lock);
	len = sprintf(buffer,
		      "Peer pid          : %d\n"
		      "Peer died         : %d\n"
		      "Peer copy mode    : %d\n"
		      "Peer copy range   : %Zu\n"
		      "Queue length      : %d\n"
		      "Queue max. length : %d\n"
		      "Queue flushing    : %d\n"
		      "Queue terminate   : %d\n",
		      nlq->peer.pid,
		      nlq->peer.died,
		      nlq->peer.copy_mode,
		      nlq->peer.copy_range,
		      nlq->len,
		      *nlq->maxlen,
		      nlq->flushing,
		      nlq->terminate);
	spin_unlock_bh(&nlq->lock);
	*start = buffer + offset;
	len -= offset;
	if (len > length)
		len = length;
	else if (len < 0)
		len = 0;
	return len;
}
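
/*
 * Reading /proc/net/ip_queue gives a snapshot like the following
 * (values are illustrative; copy mode 2 is IPQ_COPY_PACKET):
 *
 *	Peer pid          : 1297
 *	Peer died         : 0
 *	Peer copy mode    : 2
 *	Peer copy range   : 0
 *	Queue length      : 0
 *	Queue max. length : 1024
 *	Queue flushing    : 0
 *	Queue terminate   : 0
 */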
/****************************************************************************
 *
 * Module stuff.
 *
 ****************************************************************************/

static int __init init(void)
{
	int status = 0;

	nfnl = netlink_kernel_create(NETLINK_FIREWALL, netlink_receive_user_sk);
	if (nfnl == NULL) {
		printk(KERN_ERR "ip_queue: initialisation failed: unable to "
		       "create kernel netlink socket\n");
		return -ENOMEM;
	}
	nlq = ipq_create_queue(netfilter_receive,
			       netlink_send_peer, &status, &sysctl_maxlen);
	if (nlq == NULL) {
		printk(KERN_ERR "ip_queue: initialisation failed: unable to "
		       "create queue\n");
		sock_release(nfnl->socket);
		return status;
	}
	register_netdevice_notifier(&ipq_dev_notifier);
	proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
	return status;
}

static void __exit fini(void)
{
	unregister_sysctl_table(ipq_sysctl_header);
	proc_net_remove(IPQ_PROC_FS_NAME);
	unregister_netdevice_notifier(&ipq_dev_notifier);
	ipq_destroy_queue(nlq);
	sock_release(nfnl->socket);
}

MODULE_DESCRIPTION("IPv4 packet queue handler");
module_init(init);
module_exit(fini);