/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Version:	$Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#define CONFIG_SOCK_PACKET	1
/*
   Proposed replacement for SIOC{ADD,DEL}MULTI and
   IFF_PROMISC, IFF_ALLMULTI flags.

   It is more expensive, but I believe it is the really correct
   solution: reentrant, safe and fault tolerant.

   IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping a
   reference count and a global flag, so that the real status is
   (gflag|(count != 0)), so that we can use the obsolete faulty interface
   without harming clever users.
 */
#define CONFIG_PACKET_MULTICAST	1
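
/*
 * A minimal sketch of the refcounting idea described above, assuming
 * hypothetical names (flag_ref, effective_status are illustrative, not
 * kernel API):
 *
 *   struct flag_ref {
 *       int gflag;   // legacy global flag (set via the old interface)
 *       int count;   // reference count (set via the new interface)
 *   };
 *
 *   static int effective_status(const struct flag_ref *f)
 *   {
 *       // The real status is the OR of the obsolete flag and the
 *       // refcount, so old and new users coexist without harm.
 *       return f->gflag | (f->count != 0);
 *   }
 */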
/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit into the reserved space (tunnel), others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac.raw -> ll header
   data    -> data

Outgoing, dev->hard_header != NULL
   mac.raw -> ll header
   data    -> ll header

Incoming, dev->hard_header == NULL
   mac.raw -> UNKNOWN position. It is very likely that it points to ll
	      header. PPP does this, which is wrong, because it introduces
	      an asymmetry between rx and tx paths.
   data    -> data

Outgoing, dev->hard_header == NULL
   mac.raw -> data. ll header is still not built!
   data    -> data

Resume
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac.raw -> ll header
   data    -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac.raw -> data
   data    -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
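
/*
 * Illustrative userspace sketch (not part of this file): receiving raw
 * frames, ll header included, on a SOCK_RAW packet socket, matching the
 * "On receive" mapping above. Error handling is abbreviated.
 *
 *   #include <stdio.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>   // struct sockaddr_ll
 *   #include <net/ethernet.h>      // ETH_P_ALL
 *   #include <arpa/inet.h>         // htons()
 *
 *   int main(void)
 *   {
 *       // SOCK_RAW delivers frames with the link-level header pushed
 *       // back (see above); SOCK_DGRAM would strip it.
 *       int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *       if (fd < 0) { perror("socket"); return 1; }
 *
 *       unsigned char buf[2048];
 *       struct sockaddr_ll from;
 *       socklen_t fromlen = sizeof(from);
 *       ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *                            (struct sockaddr *)&from, &fromlen);
 *       if (n >= 0)
 *           printf("got %zd bytes on ifindex %d\n", n, from.sll_ifindex);
 *       return 0;
 *   }
 */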
/* List of all packet sockets. */
static HLIST_HEAD(packet_sklist);
static DEFINE_RWLOCK(packet_sklist_lock);

static atomic_t packet_socks_nr;
/* Private packet socket structures. */

#ifdef CONFIG_PACKET_MULTICAST
struct packet_mclist
{
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max
{
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
#endif
#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif
static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
#ifdef CONFIG_PACKET_MMAP
	char *			*pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;
	int			copy_thresh;
#endif
	struct packet_type	prot_hook;
	spinlock_t		bind_lock;
	unsigned int		running:1,	/* prot_hook is attached */
				auxdata:1;
	int			ifindex;	/* bound device */
	__be16			num;
#ifdef CONFIG_PACKET_MULTICAST
	struct packet_mclist	*mclist;
#endif
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;
	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;
#endif
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
#ifdef CONFIG_PACKET_MMAP

static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
{
	unsigned int pg_vec_pos, frame_offset;
	char *frame;

	pg_vec_pos = position / po->frames_per_block;
	frame_offset = position % po->frames_per_block;

	frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);

	return frame;
}
#endif
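
/*
 * Worked example of the lookup arithmetic above: with tp_block_size = 4096
 * and tp_frame_size = 2048, frames_per_block = 2, so position 5 resolves
 * to pg_vec_pos = 5 / 2 = 2 (the third block) and frame_offset = 5 % 2 = 1,
 * i.e. the frame starting 2048 bytes into pg_vec[2].
 */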
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
static void packet_sock_destruct(struct sock *sk)
{
	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		printk("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	atomic_dec(&packet_socks_nr);
#ifdef PACKET_REFCNT_DEBUG
	printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
#endif
}
static const struct proto_ops packet_ops;

#ifdef CONFIG_SOCK_PACKET
static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb->mac.raw
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb->mac.raw);

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr)
	{
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return (-EINVAL);
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	}
	else
		return (-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
	dev = dev_get_by_name(saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 *	You may not queue a frame bigger than the mtu. This is the lowest level
	 *	raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	err = -ENOBUFS;
	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

	/*
	 *	If the write buffer is full, then tough. At this level the user gets to
	 *	deal with the problem - do your own algorithmic backoffs. That's far
	 *	more flexible.
	 */

	if (skb == NULL)
		goto out_unlock;

	/*
	 *	Fill it in
	 */

	/* FIXME: Save some space for broken drivers that write a
	 * hard header at transmission time by themselves. PPP is the
	 * notable one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	/* Try to align data part correctly */
	if (dev->hard_header) {
		skb->data -= dev->hard_header_len;
		skb->tail -= dev->hard_header_len;
		if (len < dev->hard_header_len)
			skb->nh.raw = skb->data;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	if (err)
		goto out_free;

	/*
	 *	Now send it
	 */

	dev_queue_xmit(skb);
	dev_put(dev);
	return (len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
	return err;
}
#endif
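
/*
 * Illustrative userspace sketch (not part of this file): sending one
 * complete frame through the legacy SOCK_PACKET path above. The device
 * name "eth0" and the frame contents are placeholders; the caller must
 * supply a full link-level header, as the comment above requires.
 *
 *   #include <string.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>   // struct sockaddr_pkt
 *   #include <net/ethernet.h>      // ETH_P_ALL
 *   #include <arpa/inet.h>
 *
 *   static int spkt_send(const unsigned char *frame, size_t len)
 *   {
 *       int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *       struct sockaddr_pkt sp;
 *
 *       if (fd < 0)
 *           return -1;
 *       memset(&sp, 0, sizeof(sp));
 *       sp.spkt_family = AF_PACKET;
 *       strncpy((char *)sp.spkt_device, "eth0", sizeof(sp.spkt_device) - 1);
 *       sp.spkt_protocol = htons(ETH_P_ALL);
 *       // Passing sizeof(struct sockaddr_pkt) lets the kernel pick up
 *       // spkt_protocol (see the msg_namelen check above).
 *       return sendto(fd, frame, len, 0,
 *                     (struct sockaddr *)&sp, sizeof(sp));
 *   }
 */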
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}
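
/*
 * Illustrative userspace sketch (not part of this file): attaching a
 * classic BPF program with SO_ATTACH_FILTER, which populates the
 * sk->sk_filter consulted by run_filter() above. The one-instruction
 * program below accepts every packet untruncated.
 *
 *   #include <sys/socket.h>
 *   #include <linux/filter.h>
 *
 *   static int attach_accept_all(int fd)
 *   {
 *       static struct sock_filter insns[] = {
 *           // Return a huge snap length: accept the whole packet.
 *           { BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *       };
 *       struct sock_fprog prog = { .len = 1, .filter = insns };
 *
 *       return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *                         &prog, sizeof(prog));
 *   }
 */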
/*
   This function performs lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	skb->dev = dev;

	if (dev->hard_header) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;
	sll->sll_halen = 0;

	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;
}
#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;
	struct sk_buff *copy_skb = NULL;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->hard_header) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
	} else {
		unsigned maclen = skb->nh.raw - skb->data;
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);

	if (h->tp_status)
		goto ring_is_full;
	po->head = po->head != po->frame_max ? po->head+1 : 0;
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	if (skb->tstamp.off_sec == 0) {
		__net_timestamp(skb);
		sock_enable_timestamp(sk);
	}
	h->tp_sec = skb->tstamp.off_sec;
	h->tp_usec = skb->tstamp.off_usec;

	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = 0;
	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;

	h->tp_status = status;
	smp_mb();

	{
		struct page *p_start, *p_end;
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;

		p_start = virt_to_page(h);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}
#endif
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	err = -EMSGSIZE;
	if (len > dev->mtu + reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	if (dev->hard_header) {
		int res;
		err = -EINVAL;
		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (sock->type != SOCK_DGRAM) {
			skb->tail = skb->data;
			skb->len = 0;
		} else if (res < 0)
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return (len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
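
/*
 * Illustrative userspace sketch (not part of this file): transmitting
 * through packet_sendmsg() above with a SOCK_DGRAM packet socket, where
 * the kernel builds the ll header from sll_addr via dev->hard_header().
 * The ifindex, destination MAC and protocol here are placeholders.
 *
 *   #include <string.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *   #include <net/ethernet.h>      // ETH_ALEN, ETH_P_IP
 *   #include <arpa/inet.h>
 *
 *   static int dgram_send(int fd, int ifindex,
 *                         const unsigned char dst[ETH_ALEN],
 *                         const void *payload, size_t len)
 *   {
 *       struct sockaddr_ll sll;
 *
 *       memset(&sll, 0, sizeof(sll));
 *       sll.sll_family   = AF_PACKET;
 *       sll.sll_ifindex  = ifindex;
 *       sll.sll_protocol = htons(ETH_P_IP);   // placeholder protocol
 *       sll.sll_halen    = ETH_ALEN;
 *       memcpy(sll.sll_addr, dst, ETH_ALEN);
 *       return sendto(fd, payload, len, 0,
 *                     (struct sockaddr *)&sll, sizeof(sll));
 *   }
 */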
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	write_lock_bh(&packet_sklist_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&packet_sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		__sock_put(sk);
	}

#ifdef CONFIG_PACKET_MULTICAST
	packet_flush_mclist(sk);
#endif

#ifdef CONFIG_PACKET_MMAP
	if (po->pg_vec) {
		struct tpacket_req req;
		memset(&req, 0, sizeof(req));
		packet_set_ring(sk, &req, 1);
	}
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);

	sock_put(sk);
	return 0;
}
/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (dev) {
		if (dev->flags & IFF_UP) {
			dev_add_pack(&po->prot_hook);
			sock_hold(sk);
			po->running = 1;
		} else {
			sk->sk_err = ENETDOWN;
			if (!sock_flag(sk, SOCK_DEAD))
				sk->sk_error_report(sk);
		}
	} else {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}
/*
 *	Bind a packet socket to a device
 */

#ifdef CONFIG_SOCK_PACKET

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}
#endif

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}
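
/*
 * Illustrative userspace sketch (not part of this file): binding a packet
 * socket to one interface, which lands in packet_bind() above. The
 * interface name is a caller-supplied placeholder; if_nametoindex() is
 * from <net/if.h>.
 *
 *   #include <string.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *   #include <net/ethernet.h>      // ETH_P_ALL
 *   #include <net/if.h>
 *   #include <arpa/inet.h>
 *
 *   static int bind_to_dev(int fd, const char *ifname)
 *   {
 *       struct sockaddr_ll sll;
 *
 *       memset(&sll, 0, sizeof(sll));
 *       sll.sll_family   = AF_PACKET;
 *       sll.sll_protocol = htons(ETH_P_ALL);
 *       sll.sll_ifindex  = if_nametoindex(ifname);
 *       return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *   }
 */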
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
	    && sock->type != SOCK_PACKET
#endif
	    )
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;
#endif
	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	atomic_inc(&packet_socks_nr);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;
#endif
	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return (0);
out:
	return err;
}
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN if the device has just gone
	 *	down, but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb->nh.raw - skb->data;

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = (flags & MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
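
/*
 * Illustrative userspace sketch (not part of this file): reading the
 * PACKET_AUXDATA control message that packet_recvmsg() emits above.
 * Error handling abbreviated.
 *
 *   #include <string.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *
 *   static void read_aux(int fd)
 *   {
 *       int on = 1;
 *       unsigned char buf[2048];
 *       unsigned char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *       struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *       struct msghdr msg = {
 *           .msg_iov = &iov, .msg_iovlen = 1,
 *           .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *       };
 *       struct cmsghdr *c;
 *
 *       setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &on, sizeof(on));
 *       if (recvmsg(fd, &msg, 0) < 0)
 *           return;
 *       for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
 *           if (c->cmsg_level == SOL_PACKET && c->cmsg_type == PACKET_AUXDATA) {
 *               struct tpacket_auxdata aux;
 *               memcpy(&aux, CMSG_DATA(c), sizeof(aux));
 *               // aux.tp_len is the original length (origlen above);
 *               // aux.tp_snaplen is how much was actually delivered.
 *           }
 *       }
 *   }
 */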
#ifdef CONFIG_SOCK_PACKET
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	dev = dev_get_by_index(pkt_sk(sk)->ifindex);
	if (dev) {
		strlcpy(uaddr->sa_data, dev->name, 15);
		dev_put(dev);
	} else
		memset(uaddr->sa_data, 0, 14);
	*uaddr_len = sizeof(*uaddr);

	return 0;
}
#endif

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	dev = dev_get_by_index(po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
		dev_put(dev);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}
#ifdef CONFIG_PACKET_MULTICAST
static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (what > 0)
			dev_mc_add(dev, i->addr, i->alen, 0);
		else
			dev_mc_delete(dev, i->addr, i->alen, 0);
		break;
	case PACKET_MR_PROMISC:
		dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		dev_set_allmulti(dev, what);
		break;
	default:;
	}
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	packet_dev_mc(dev, i, +1);

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = dev_get_by_index(ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
			packet_dev_mc(dev, ml, -1);
			dev_put(dev);
		}
		kfree(ml);
	}
	rtnl_unlock();
}
#endif
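
/*
 * Illustrative userspace sketch (not part of this file): enabling
 * promiscuous mode through the refcounted membership interface handled
 * by packet_mc_add() above, rather than the obsolete IFF_PROMISC flag.
 *
 *   #include <string.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *   #include <net/if.h>            // if_nametoindex()
 *
 *   static int enable_promisc(int fd, const char *ifname)
 *   {
 *       struct packet_mreq mreq;
 *
 *       memset(&mreq, 0, sizeof(mreq));
 *       mreq.mr_ifindex = if_nametoindex(ifname);
 *       mreq.mr_type    = PACKET_MR_PROMISC;
 *       return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *                         &mreq, sizeof(mreq));
 *   }
 */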
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
#ifdef CONFIG_PACKET_MULTICAST
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}
#endif
#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
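
/*
 * Illustrative userspace sketch (not part of this file): polling the drop
 * counters served by PACKET_STATISTICS above. Note the kernel zeroes the
 * counters on every read, so the values are per-interval.
 *
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *
 *   static int read_stats(int fd, struct tpacket_stats *st)
 *   {
 *       socklen_t len = sizeof(*st);
 *       // st->tp_packets includes drops (see st.tp_packets += st.tp_drops).
 *       return getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, st, &len);
 *   }
 */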
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = (struct net_device*)data;

	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
#ifdef CONFIG_PACKET_MULTICAST
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */
#endif
		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
		case SIOCOUTQ:
		{
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			struct sk_buff *skb;
			int amount = 0;

			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);

#ifdef CONFIG_INET
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return -ENOIOCTLCMD;
	}
	return 0;
}
#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		unsigned last = po->head ? po->head - 1 : po->frame_max;
		struct tpacket_hdr *h;

		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static struct vm_operations_struct packet_mmap_ops = {
	.open =		packet_mm_open,
	.close =	packet_mm_close,
};

static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
{
	return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
}

static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
					 order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	char **pg_vec;
	int i;

	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i]))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	__be16 num;
	int err = 0;

	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (unlikely(po->pg_vec))
			return -EBUSY;

		if (unlikely((int)req->tp_block_size <= 0))
			return -EINVAL;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			return -EINVAL;
		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
			return -EINVAL;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(po->frames_per_block <= 0))
			return -EINVAL;
		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
			     req->tp_frame_nr))
			return -EINVAL;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;

		l = 0;
		for (i = 0; i < req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k = 0; k < po->frames_per_block; k++) {
				header = (struct tpacket_hdr *) ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		if (unlikely(req->tp_frame_nr))
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = (req->tp_frame_nr - 1);
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	if (size != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
		goto out;

	start = vma->vm_start;
	for (i = 0; i < po->pg_vec_len; i++) {
		struct page *page = virt_to_page(po->pg_vec[i]);
		int pg_num;

		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
			err = vm_insert_page(vma, start, page);
			if (unlikely(err))
				goto out;
			start += PAGE_SIZE;
		}
	}
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
#endif
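
/*
 * Illustrative userspace sketch (not part of this file): setting up the
 * PACKET_RX_RING handled by packet_set_ring() above, mapping it with
 * mmap(), and reaping one frame. The ring geometry is an arbitrary
 * example; error handling is abbreviated.
 *
 *   #include <stdio.h>
 *   #include <poll.h>
 *   #include <sys/mman.h>
 *   #include <sys/socket.h>
 *   #include <linux/if_packet.h>
 *   #include <net/ethernet.h>
 *   #include <arpa/inet.h>
 *
 *   int main(void)
 *   {
 *       int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *       // 4 blocks of 4 KiB, 2 frames per block: tp_frame_nr must equal
 *       // frames_per_block * tp_block_nr or packet_set_ring() rejects it.
 *       struct tpacket_req req = {
 *           .tp_block_size = 4096,
 *           .tp_block_nr   = 4,
 *           .tp_frame_size = 2048,
 *           .tp_frame_nr   = 8,
 *       };
 *       setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 *       size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
 *       unsigned char *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                                  MAP_SHARED, fd, 0);
 *
 *       struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *       poll(&pfd, 1, -1);
 *
 *       // Frame 0: tpacket_rcv() flips tp_status away from
 *       // TP_STATUS_KERNEL once data is ready; hand the slot back by
 *       // writing TP_STATUS_KERNEL again.
 *       struct tpacket_hdr *h = (struct tpacket_hdr *)ring;
 *       if (h->tp_status & TP_STATUS_USER) {
 *           printf("frame: len %u snaplen %u\n", h->tp_len, h->tp_snaplen);
 *           h->tp_status = TP_STATUS_KERNEL;
 *       }
 *       return 0;
 *   }
 */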
#ifdef CONFIG_SOCK_PACKET
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
#endif

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};
#ifdef CONFIG_PROC_FS
static inline struct sock *packet_seq_idx(loff_t off)
{
	struct sock *s;
	struct hlist_node *node;

	sk_for_each(s, node, &packet_sklist) {
		if (!off--)
			return s;
	}
	return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&packet_sklist_lock);
	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return  (v == SEQ_START_TOKEN)
		? sk_head(&packet_sklist)
		: sk_next((struct sock*)v);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

#endif
static void __exit packet_exit(void)
{
	proc_net_remove("packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
	proc_net_fops_create("packet", 0, &packet_seq_fops);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);