Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
[linux-2.6.git] / net / packet / af_packet.c
blob031a5e6fb4aa9c8dfc64f7a780cdb15496eb759f
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Fixes:
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
44 * This program is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU General Public License
46 * as published by the Free Software Foundation; either version
47 * 2 of the License, or (at your option) any later version.
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 #include <linux/if_vlan.h>
83 #include <linux/virtio_net.h>
85 #ifdef CONFIG_INET
86 #include <net/inet_common.h>
87 #endif
90 Assumptions:
91 - if device has no dev->hard_header routine, it adds and removes ll header
92 inside itself. In this case ll header is invisible outside of device,
93 but higher levels still should reserve dev->hard_header_len.
94 Some devices are clever enough to reallocate the skb when the header
95 will not fit in the reserved space (tunnel); others are not
96 (PPP).
97 - packet socket receives packets with pulled ll header,
98 so that SOCK_RAW should push it back.
100 On receive:
101 -----------
103 Incoming, dev->hard_header!=NULL
104 mac_header -> ll header
105 data -> data
107 Outgoing, dev->hard_header!=NULL
108 mac_header -> ll header
109 data -> ll header
111 Incoming, dev->hard_header==NULL
112 mac_header -> UNKNOWN position. It is very likely that it points to the ll
113 header. PPP does this, which is wrong because it introduces
114 asymmetry between the rx and tx paths.
115 data -> data
117 Outgoing, dev->hard_header==NULL
118 mac_header -> data. ll header is still not built!
119 data -> data
121 Resume
122 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
125 On transmit:
126 ------------
128 dev->hard_header != NULL
129 mac_header -> ll header
130 data -> ll header
132 dev->hard_header == NULL (ll header is added by device, we cannot control it)
133 mac_header -> data
134 data -> data
136 We should set nh.raw on output to the correct position;
137 the packet classifier depends on it.
140 /* Private packet socket structures. */
142 struct packet_mclist {
143 struct packet_mclist *next;
144 int ifindex;
145 int count;
146 unsigned short type;
147 unsigned short alen;
148 unsigned char addr[MAX_ADDR_LEN];
150 /* identical to struct packet_mreq except it has
151 * a longer address field.
153 struct packet_mreq_max {
154 int mr_ifindex;
155 unsigned short mr_type;
156 unsigned short mr_alen;
157 unsigned char mr_address[MAX_ADDR_LEN];
160 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
161 int closing, int tx_ring);
163 struct packet_ring_buffer {
164 char **pg_vec;
165 unsigned int head;
166 unsigned int frames_per_block;
167 unsigned int frame_size;
168 unsigned int frame_max;
170 unsigned int pg_vec_order;
171 unsigned int pg_vec_pages;
172 unsigned int pg_vec_len;
174 atomic_t pending;
177 struct packet_sock;
178 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
180 static void packet_flush_mclist(struct sock *sk);
182 struct packet_sock {
183 /* struct sock has to be the first member of packet_sock */
184 struct sock sk;
185 struct tpacket_stats stats;
186 struct packet_ring_buffer rx_ring;
187 struct packet_ring_buffer tx_ring;
188 int copy_thresh;
189 spinlock_t bind_lock;
190 struct mutex pg_vec_lock;
191 unsigned int running:1, /* prot_hook is attached*/
192 auxdata:1,
193 origdev:1,
194 has_vnet_hdr:1;
195 int ifindex; /* bound device */
196 __be16 num;
197 struct packet_mclist *mclist;
198 atomic_t mapped;
199 enum tpacket_versions tp_version;
200 unsigned int tp_hdrlen;
201 unsigned int tp_reserve;
202 unsigned int tp_loss:1;
203 struct packet_type prot_hook ____cacheline_aligned_in_smp;
206 struct packet_skb_cb {
207 unsigned int origlen;
208 union {
209 struct sockaddr_pkt pkt;
210 struct sockaddr_ll ll;
211 } sa;
214 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
216 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
218 union {
219 struct tpacket_hdr *h1;
220 struct tpacket2_hdr *h2;
221 void *raw;
222 } h;
224 h.raw = frame;
225 switch (po->tp_version) {
226 case TPACKET_V1:
227 h.h1->tp_status = status;
228 flush_dcache_page(virt_to_page(&h.h1->tp_status));
229 break;
230 case TPACKET_V2:
231 h.h2->tp_status = status;
232 flush_dcache_page(virt_to_page(&h.h2->tp_status));
233 break;
234 default:
235 pr_err("TPACKET version not supported\n");
236 BUG();
239 smp_wmb();
242 static int __packet_get_status(struct packet_sock *po, void *frame)
244 union {
245 struct tpacket_hdr *h1;
246 struct tpacket2_hdr *h2;
247 void *raw;
248 } h;
250 smp_rmb();
252 h.raw = frame;
253 switch (po->tp_version) {
254 case TPACKET_V1:
255 flush_dcache_page(virt_to_page(&h.h1->tp_status));
256 return h.h1->tp_status;
257 case TPACKET_V2:
258 flush_dcache_page(virt_to_page(&h.h2->tp_status));
259 return h.h2->tp_status;
260 default:
261 pr_err("TPACKET version not supported\n");
262 BUG();
263 return 0;
267 static void *packet_lookup_frame(struct packet_sock *po,
268 struct packet_ring_buffer *rb,
269 unsigned int position,
270 int status)
272 unsigned int pg_vec_pos, frame_offset;
273 union {
274 struct tpacket_hdr *h1;
275 struct tpacket2_hdr *h2;
276 void *raw;
277 } h;
279 pg_vec_pos = position / rb->frames_per_block;
280 frame_offset = position % rb->frames_per_block;
282 h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
284 if (status != __packet_get_status(po, h.raw))
285 return NULL;
287 return h.raw;
290 static inline void *packet_current_frame(struct packet_sock *po,
291 struct packet_ring_buffer *rb,
292 int status)
294 return packet_lookup_frame(po, rb, rb->head, status);
297 static inline void *packet_previous_frame(struct packet_sock *po,
298 struct packet_ring_buffer *rb,
299 int status)
301 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
302 return packet_lookup_frame(po, rb, previous, status);
305 static inline void packet_increment_head(struct packet_ring_buffer *buff)
307 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
/*
 * Convert a struct sock pointer to its enclosing packet_sock. This is a
 * plain cast, valid because 'sk' is the first member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
315 static void packet_sock_destruct(struct sock *sk)
317 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
318 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
320 if (!sock_flag(sk, SOCK_DEAD)) {
321 pr_err("Attempt to release alive packet socket: %p\n", sk);
322 return;
325 sk_refcnt_debug_dec(sk);
329 static const struct proto_ops packet_ops;
331 static const struct proto_ops packet_ops_spkt;
333 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
334 struct packet_type *pt, struct net_device *orig_dev)
336 struct sock *sk;
337 struct sockaddr_pkt *spkt;
340 * When we registered the protocol we saved the socket in the data
341 * field for just this event.
344 sk = pt->af_packet_priv;
347 * Yank back the headers [hope the device set this
348 * right or kerboom...]
350 * Incoming packets have ll header pulled,
351 * push it back.
353 * For outgoing ones skb->data == skb_mac_header(skb)
354 * so that this procedure is noop.
357 if (skb->pkt_type == PACKET_LOOPBACK)
358 goto out;
360 if (!net_eq(dev_net(dev), sock_net(sk)))
361 goto out;
363 skb = skb_share_check(skb, GFP_ATOMIC);
364 if (skb == NULL)
365 goto oom;
367 /* drop any routing info */
368 skb_dst_drop(skb);
370 /* drop conntrack reference */
371 nf_reset(skb);
373 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
375 skb_push(skb, skb->data - skb_mac_header(skb));
378 * The SOCK_PACKET socket receives _all_ frames.
381 spkt->spkt_family = dev->type;
382 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
383 spkt->spkt_protocol = skb->protocol;
386 * Charge the memory to the socket. This is done specifically
387 * to prevent sockets using all the memory up.
390 if (sock_queue_rcv_skb(sk, skb) == 0)
391 return 0;
393 out:
394 kfree_skb(skb);
395 oom:
396 return 0;
401 * Output a raw packet to a device layer. This bypasses all the other
402 * protocol layers and you must therefore supply it with a complete frame
405 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
406 struct msghdr *msg, size_t len)
408 struct sock *sk = sock->sk;
409 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
410 struct sk_buff *skb = NULL;
411 struct net_device *dev;
412 __be16 proto = 0;
413 int err;
416 * Get and verify the address.
419 if (saddr) {
420 if (msg->msg_namelen < sizeof(struct sockaddr))
421 return -EINVAL;
422 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
423 proto = saddr->spkt_protocol;
424 } else
425 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
428 * Find the device first to size check it
431 saddr->spkt_device[13] = 0;
432 retry:
433 rcu_read_lock();
434 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
435 err = -ENODEV;
436 if (dev == NULL)
437 goto out_unlock;
439 err = -ENETDOWN;
440 if (!(dev->flags & IFF_UP))
441 goto out_unlock;
444 * You may not queue a frame bigger than the mtu. This is the lowest level
445 * raw protocol and you must do your own fragmentation at this level.
448 err = -EMSGSIZE;
449 if (len > dev->mtu + dev->hard_header_len)
450 goto out_unlock;
452 if (!skb) {
453 size_t reserved = LL_RESERVED_SPACE(dev);
454 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
456 rcu_read_unlock();
457 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
458 if (skb == NULL)
459 return -ENOBUFS;
460 /* FIXME: Save some space for broken drivers that write a hard
461 * header at transmission time by themselves. PPP is the notable
462 * one here. This should really be fixed at the driver level.
464 skb_reserve(skb, reserved);
465 skb_reset_network_header(skb);
467 /* Try to align data part correctly */
468 if (hhlen) {
469 skb->data -= hhlen;
470 skb->tail -= hhlen;
471 if (len < hhlen)
472 skb_reset_network_header(skb);
474 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
475 if (err)
476 goto out_free;
477 goto retry;
481 skb->protocol = proto;
482 skb->dev = dev;
483 skb->priority = sk->sk_priority;
484 skb->mark = sk->sk_mark;
486 dev_queue_xmit(skb);
487 rcu_read_unlock();
488 return len;
490 out_unlock:
491 rcu_read_unlock();
492 out_free:
493 kfree_skb(skb);
494 return err;
497 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
498 unsigned int res)
500 struct sk_filter *filter;
502 rcu_read_lock_bh();
503 filter = rcu_dereference_bh(sk->sk_filter);
504 if (filter != NULL)
505 res = sk_run_filter(skb, filter->insns, filter->len);
506 rcu_read_unlock_bh();
508 return res;
512 This function makes lazy skb cloning in hope that most of packets
513 are discarded by BPF.
515 Note tricky part: we DO mangle shared skb! skb->data, skb->len
516 and skb->cb are mangled. It works because (and until) packets
517 falling here are owned by current CPU. Output packets are cloned
518 by dev_queue_xmit_nit(), input packets are processed by net_bh
519 sequencially, so that if we return skb to original state on exit,
520 we will not harm anyone.
523 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
524 struct packet_type *pt, struct net_device *orig_dev)
526 struct sock *sk;
527 struct sockaddr_ll *sll;
528 struct packet_sock *po;
529 u8 *skb_head = skb->data;
530 int skb_len = skb->len;
531 unsigned int snaplen, res;
533 if (skb->pkt_type == PACKET_LOOPBACK)
534 goto drop;
536 sk = pt->af_packet_priv;
537 po = pkt_sk(sk);
539 if (!net_eq(dev_net(dev), sock_net(sk)))
540 goto drop;
542 skb->dev = dev;
544 if (dev->header_ops) {
545 /* The device has an explicit notion of ll header,
546 exported to higher levels.
548 Otherwise, the device hides datails of it frame
549 structure, so that corresponding packet head
550 never delivered to user.
552 if (sk->sk_type != SOCK_DGRAM)
553 skb_push(skb, skb->data - skb_mac_header(skb));
554 else if (skb->pkt_type == PACKET_OUTGOING) {
555 /* Special case: outgoing packets have ll header at head */
556 skb_pull(skb, skb_network_offset(skb));
560 snaplen = skb->len;
562 res = run_filter(skb, sk, snaplen);
563 if (!res)
564 goto drop_n_restore;
565 if (snaplen > res)
566 snaplen = res;
568 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
569 (unsigned)sk->sk_rcvbuf)
570 goto drop_n_acct;
572 if (skb_shared(skb)) {
573 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
574 if (nskb == NULL)
575 goto drop_n_acct;
577 if (skb_head != skb->data) {
578 skb->data = skb_head;
579 skb->len = skb_len;
581 kfree_skb(skb);
582 skb = nskb;
585 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
586 sizeof(skb->cb));
588 sll = &PACKET_SKB_CB(skb)->sa.ll;
589 sll->sll_family = AF_PACKET;
590 sll->sll_hatype = dev->type;
591 sll->sll_protocol = skb->protocol;
592 sll->sll_pkttype = skb->pkt_type;
593 if (unlikely(po->origdev))
594 sll->sll_ifindex = orig_dev->ifindex;
595 else
596 sll->sll_ifindex = dev->ifindex;
598 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
600 PACKET_SKB_CB(skb)->origlen = skb->len;
602 if (pskb_trim(skb, snaplen))
603 goto drop_n_acct;
605 skb_set_owner_r(skb, sk);
606 skb->dev = NULL;
607 skb_dst_drop(skb);
609 /* drop conntrack reference */
610 nf_reset(skb);
612 spin_lock(&sk->sk_receive_queue.lock);
613 po->stats.tp_packets++;
614 skb->dropcount = atomic_read(&sk->sk_drops);
615 __skb_queue_tail(&sk->sk_receive_queue, skb);
616 spin_unlock(&sk->sk_receive_queue.lock);
617 sk->sk_data_ready(sk, skb->len);
618 return 0;
620 drop_n_acct:
621 po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
623 drop_n_restore:
624 if (skb_head != skb->data && skb_shared(skb)) {
625 skb->data = skb_head;
626 skb->len = skb_len;
628 drop:
629 consume_skb(skb);
630 return 0;
633 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
634 struct packet_type *pt, struct net_device *orig_dev)
636 struct sock *sk;
637 struct packet_sock *po;
638 struct sockaddr_ll *sll;
639 union {
640 struct tpacket_hdr *h1;
641 struct tpacket2_hdr *h2;
642 void *raw;
643 } h;
644 u8 *skb_head = skb->data;
645 int skb_len = skb->len;
646 unsigned int snaplen, res;
647 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
648 unsigned short macoff, netoff, hdrlen;
649 struct sk_buff *copy_skb = NULL;
650 struct timeval tv;
651 struct timespec ts;
653 if (skb->pkt_type == PACKET_LOOPBACK)
654 goto drop;
656 sk = pt->af_packet_priv;
657 po = pkt_sk(sk);
659 if (!net_eq(dev_net(dev), sock_net(sk)))
660 goto drop;
662 if (dev->header_ops) {
663 if (sk->sk_type != SOCK_DGRAM)
664 skb_push(skb, skb->data - skb_mac_header(skb));
665 else if (skb->pkt_type == PACKET_OUTGOING) {
666 /* Special case: outgoing packets have ll header at head */
667 skb_pull(skb, skb_network_offset(skb));
671 if (skb->ip_summed == CHECKSUM_PARTIAL)
672 status |= TP_STATUS_CSUMNOTREADY;
674 snaplen = skb->len;
676 res = run_filter(skb, sk, snaplen);
677 if (!res)
678 goto drop_n_restore;
679 if (snaplen > res)
680 snaplen = res;
682 if (sk->sk_type == SOCK_DGRAM) {
683 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
684 po->tp_reserve;
685 } else {
686 unsigned maclen = skb_network_offset(skb);
687 netoff = TPACKET_ALIGN(po->tp_hdrlen +
688 (maclen < 16 ? 16 : maclen)) +
689 po->tp_reserve;
690 macoff = netoff - maclen;
693 if (macoff + snaplen > po->rx_ring.frame_size) {
694 if (po->copy_thresh &&
695 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
696 (unsigned)sk->sk_rcvbuf) {
697 if (skb_shared(skb)) {
698 copy_skb = skb_clone(skb, GFP_ATOMIC);
699 } else {
700 copy_skb = skb_get(skb);
701 skb_head = skb->data;
703 if (copy_skb)
704 skb_set_owner_r(copy_skb, sk);
706 snaplen = po->rx_ring.frame_size - macoff;
707 if ((int)snaplen < 0)
708 snaplen = 0;
711 spin_lock(&sk->sk_receive_queue.lock);
712 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
713 if (!h.raw)
714 goto ring_is_full;
715 packet_increment_head(&po->rx_ring);
716 po->stats.tp_packets++;
717 if (copy_skb) {
718 status |= TP_STATUS_COPY;
719 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
721 if (!po->stats.tp_drops)
722 status &= ~TP_STATUS_LOSING;
723 spin_unlock(&sk->sk_receive_queue.lock);
725 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
727 switch (po->tp_version) {
728 case TPACKET_V1:
729 h.h1->tp_len = skb->len;
730 h.h1->tp_snaplen = snaplen;
731 h.h1->tp_mac = macoff;
732 h.h1->tp_net = netoff;
733 if (skb->tstamp.tv64)
734 tv = ktime_to_timeval(skb->tstamp);
735 else
736 do_gettimeofday(&tv);
737 h.h1->tp_sec = tv.tv_sec;
738 h.h1->tp_usec = tv.tv_usec;
739 hdrlen = sizeof(*h.h1);
740 break;
741 case TPACKET_V2:
742 h.h2->tp_len = skb->len;
743 h.h2->tp_snaplen = snaplen;
744 h.h2->tp_mac = macoff;
745 h.h2->tp_net = netoff;
746 if (skb->tstamp.tv64)
747 ts = ktime_to_timespec(skb->tstamp);
748 else
749 getnstimeofday(&ts);
750 h.h2->tp_sec = ts.tv_sec;
751 h.h2->tp_nsec = ts.tv_nsec;
752 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
753 hdrlen = sizeof(*h.h2);
754 break;
755 default:
756 BUG();
759 sll = h.raw + TPACKET_ALIGN(hdrlen);
760 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
761 sll->sll_family = AF_PACKET;
762 sll->sll_hatype = dev->type;
763 sll->sll_protocol = skb->protocol;
764 sll->sll_pkttype = skb->pkt_type;
765 if (unlikely(po->origdev))
766 sll->sll_ifindex = orig_dev->ifindex;
767 else
768 sll->sll_ifindex = dev->ifindex;
770 __packet_set_status(po, h.raw, status);
771 smp_mb();
773 struct page *p_start, *p_end;
774 u8 *h_end = h.raw + macoff + snaplen - 1;
776 p_start = virt_to_page(h.raw);
777 p_end = virt_to_page(h_end);
778 while (p_start <= p_end) {
779 flush_dcache_page(p_start);
780 p_start++;
784 sk->sk_data_ready(sk, 0);
786 drop_n_restore:
787 if (skb_head != skb->data && skb_shared(skb)) {
788 skb->data = skb_head;
789 skb->len = skb_len;
791 drop:
792 kfree_skb(skb);
793 return 0;
795 ring_is_full:
796 po->stats.tp_drops++;
797 spin_unlock(&sk->sk_receive_queue.lock);
799 sk->sk_data_ready(sk, 0);
800 kfree_skb(copy_skb);
801 goto drop_n_restore;
804 static void tpacket_destruct_skb(struct sk_buff *skb)
806 struct packet_sock *po = pkt_sk(skb->sk);
807 void *ph;
809 BUG_ON(skb == NULL);
811 if (likely(po->tx_ring.pg_vec)) {
812 ph = skb_shinfo(skb)->destructor_arg;
813 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
814 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
815 atomic_dec(&po->tx_ring.pending);
816 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
819 sock_wfree(skb);
822 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
823 void *frame, struct net_device *dev, int size_max,
824 __be16 proto, unsigned char *addr)
826 union {
827 struct tpacket_hdr *h1;
828 struct tpacket2_hdr *h2;
829 void *raw;
830 } ph;
831 int to_write, offset, len, tp_len, nr_frags, len_max;
832 struct socket *sock = po->sk.sk_socket;
833 struct page *page;
834 void *data;
835 int err;
837 ph.raw = frame;
839 skb->protocol = proto;
840 skb->dev = dev;
841 skb->priority = po->sk.sk_priority;
842 skb->mark = po->sk.sk_mark;
843 skb_shinfo(skb)->destructor_arg = ph.raw;
845 switch (po->tp_version) {
846 case TPACKET_V2:
847 tp_len = ph.h2->tp_len;
848 break;
849 default:
850 tp_len = ph.h1->tp_len;
851 break;
853 if (unlikely(tp_len > size_max)) {
854 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
855 return -EMSGSIZE;
858 skb_reserve(skb, LL_RESERVED_SPACE(dev));
859 skb_reset_network_header(skb);
861 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
862 to_write = tp_len;
864 if (sock->type == SOCK_DGRAM) {
865 err = dev_hard_header(skb, dev, ntohs(proto), addr,
866 NULL, tp_len);
867 if (unlikely(err < 0))
868 return -EINVAL;
869 } else if (dev->hard_header_len) {
870 /* net device doesn't like empty head */
871 if (unlikely(tp_len <= dev->hard_header_len)) {
872 pr_err("packet size is too short (%d < %d)\n",
873 tp_len, dev->hard_header_len);
874 return -EINVAL;
877 skb_push(skb, dev->hard_header_len);
878 err = skb_store_bits(skb, 0, data,
879 dev->hard_header_len);
880 if (unlikely(err))
881 return err;
883 data += dev->hard_header_len;
884 to_write -= dev->hard_header_len;
887 err = -EFAULT;
888 page = virt_to_page(data);
889 offset = offset_in_page(data);
890 len_max = PAGE_SIZE - offset;
891 len = ((to_write > len_max) ? len_max : to_write);
893 skb->data_len = to_write;
894 skb->len += to_write;
895 skb->truesize += to_write;
896 atomic_add(to_write, &po->sk.sk_wmem_alloc);
898 while (likely(to_write)) {
899 nr_frags = skb_shinfo(skb)->nr_frags;
901 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
902 pr_err("Packet exceed the number of skb frags(%lu)\n",
903 MAX_SKB_FRAGS);
904 return -EFAULT;
907 flush_dcache_page(page);
908 get_page(page);
909 skb_fill_page_desc(skb,
910 nr_frags,
911 page++, offset, len);
912 to_write -= len;
913 offset = 0;
914 len_max = PAGE_SIZE;
915 len = ((to_write > len_max) ? len_max : to_write);
918 return tp_len;
921 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
923 struct socket *sock;
924 struct sk_buff *skb;
925 struct net_device *dev;
926 __be16 proto;
927 int ifindex, err, reserve = 0;
928 void *ph;
929 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
930 int tp_len, size_max;
931 unsigned char *addr;
932 int len_sum = 0;
933 int status = 0;
935 sock = po->sk.sk_socket;
937 mutex_lock(&po->pg_vec_lock);
939 err = -EBUSY;
940 if (saddr == NULL) {
941 ifindex = po->ifindex;
942 proto = po->num;
943 addr = NULL;
944 } else {
945 err = -EINVAL;
946 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
947 goto out;
948 if (msg->msg_namelen < (saddr->sll_halen
949 + offsetof(struct sockaddr_ll,
950 sll_addr)))
951 goto out;
952 ifindex = saddr->sll_ifindex;
953 proto = saddr->sll_protocol;
954 addr = saddr->sll_addr;
957 dev = dev_get_by_index(sock_net(&po->sk), ifindex);
958 err = -ENXIO;
959 if (unlikely(dev == NULL))
960 goto out;
962 reserve = dev->hard_header_len;
964 err = -ENETDOWN;
965 if (unlikely(!(dev->flags & IFF_UP)))
966 goto out_put;
968 size_max = po->tx_ring.frame_size
969 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
971 if (size_max > dev->mtu + reserve)
972 size_max = dev->mtu + reserve;
974 do {
975 ph = packet_current_frame(po, &po->tx_ring,
976 TP_STATUS_SEND_REQUEST);
978 if (unlikely(ph == NULL)) {
979 schedule();
980 continue;
983 status = TP_STATUS_SEND_REQUEST;
984 skb = sock_alloc_send_skb(&po->sk,
985 LL_ALLOCATED_SPACE(dev)
986 + sizeof(struct sockaddr_ll),
987 0, &err);
989 if (unlikely(skb == NULL))
990 goto out_status;
992 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
993 addr);
995 if (unlikely(tp_len < 0)) {
996 if (po->tp_loss) {
997 __packet_set_status(po, ph,
998 TP_STATUS_AVAILABLE);
999 packet_increment_head(&po->tx_ring);
1000 kfree_skb(skb);
1001 continue;
1002 } else {
1003 status = TP_STATUS_WRONG_FORMAT;
1004 err = tp_len;
1005 goto out_status;
1009 skb->destructor = tpacket_destruct_skb;
1010 __packet_set_status(po, ph, TP_STATUS_SENDING);
1011 atomic_inc(&po->tx_ring.pending);
1013 status = TP_STATUS_SEND_REQUEST;
1014 err = dev_queue_xmit(skb);
1015 if (unlikely(err > 0)) {
1016 err = net_xmit_errno(err);
1017 if (err && __packet_get_status(po, ph) ==
1018 TP_STATUS_AVAILABLE) {
1019 /* skb was destructed already */
1020 skb = NULL;
1021 goto out_status;
1024 * skb was dropped but not destructed yet;
1025 * let's treat it like congestion or err < 0
1027 err = 0;
1029 packet_increment_head(&po->tx_ring);
1030 len_sum += tp_len;
1031 } while (likely((ph != NULL) ||
1032 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1033 (atomic_read(&po->tx_ring.pending))))
1036 err = len_sum;
1037 goto out_put;
1039 out_status:
1040 __packet_set_status(po, ph, status);
1041 kfree_skb(skb);
1042 out_put:
1043 dev_put(dev);
1044 out:
1045 mutex_unlock(&po->pg_vec_lock);
1046 return err;
1049 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1050 size_t reserve, size_t len,
1051 size_t linear, int noblock,
1052 int *err)
1054 struct sk_buff *skb;
1056 /* Under a page? Don't bother with paged skb. */
1057 if (prepad + len < PAGE_SIZE || !linear)
1058 linear = len;
1060 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1061 err);
1062 if (!skb)
1063 return NULL;
1065 skb_reserve(skb, reserve);
1066 skb_put(skb, linear);
1067 skb->data_len = len - linear;
1068 skb->len += len - linear;
1070 return skb;
1073 static int packet_snd(struct socket *sock,
1074 struct msghdr *msg, size_t len)
1076 struct sock *sk = sock->sk;
1077 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1078 struct sk_buff *skb;
1079 struct net_device *dev;
1080 __be16 proto;
1081 unsigned char *addr;
1082 int ifindex, err, reserve = 0;
1083 struct virtio_net_hdr vnet_hdr = { 0 };
1084 int offset = 0;
1085 int vnet_hdr_len;
1086 struct packet_sock *po = pkt_sk(sk);
1087 unsigned short gso_type = 0;
1090 * Get and verify the address.
1093 if (saddr == NULL) {
1094 ifindex = po->ifindex;
1095 proto = po->num;
1096 addr = NULL;
1097 } else {
1098 err = -EINVAL;
1099 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1100 goto out;
1101 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1102 goto out;
1103 ifindex = saddr->sll_ifindex;
1104 proto = saddr->sll_protocol;
1105 addr = saddr->sll_addr;
1109 dev = dev_get_by_index(sock_net(sk), ifindex);
1110 err = -ENXIO;
1111 if (dev == NULL)
1112 goto out_unlock;
1113 if (sock->type == SOCK_RAW)
1114 reserve = dev->hard_header_len;
1116 err = -ENETDOWN;
1117 if (!(dev->flags & IFF_UP))
1118 goto out_unlock;
1120 if (po->has_vnet_hdr) {
1121 vnet_hdr_len = sizeof(vnet_hdr);
1123 err = -EINVAL;
1124 if (len < vnet_hdr_len)
1125 goto out_unlock;
1127 len -= vnet_hdr_len;
1129 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1130 vnet_hdr_len);
1131 if (err < 0)
1132 goto out_unlock;
1134 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1135 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1136 vnet_hdr.hdr_len))
1137 vnet_hdr.hdr_len = vnet_hdr.csum_start +
1138 vnet_hdr.csum_offset + 2;
1140 err = -EINVAL;
1141 if (vnet_hdr.hdr_len > len)
1142 goto out_unlock;
1144 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1145 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1146 case VIRTIO_NET_HDR_GSO_TCPV4:
1147 gso_type = SKB_GSO_TCPV4;
1148 break;
1149 case VIRTIO_NET_HDR_GSO_TCPV6:
1150 gso_type = SKB_GSO_TCPV6;
1151 break;
1152 case VIRTIO_NET_HDR_GSO_UDP:
1153 gso_type = SKB_GSO_UDP;
1154 break;
1155 default:
1156 goto out_unlock;
1159 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1160 gso_type |= SKB_GSO_TCP_ECN;
1162 if (vnet_hdr.gso_size == 0)
1163 goto out_unlock;
1168 err = -EMSGSIZE;
1169 if (!gso_type && (len > dev->mtu+reserve))
1170 goto out_unlock;
1172 err = -ENOBUFS;
1173 skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1174 LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1175 msg->msg_flags & MSG_DONTWAIT, &err);
1176 if (skb == NULL)
1177 goto out_unlock;
1179 skb_set_network_header(skb, reserve);
1181 err = -EINVAL;
1182 if (sock->type == SOCK_DGRAM &&
1183 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1184 goto out_free;
1186 /* Returns -EFAULT on error */
1187 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1188 if (err)
1189 goto out_free;
1191 skb->protocol = proto;
1192 skb->dev = dev;
1193 skb->priority = sk->sk_priority;
1194 skb->mark = sk->sk_mark;
1196 if (po->has_vnet_hdr) {
1197 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1198 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1199 vnet_hdr.csum_offset)) {
1200 err = -EINVAL;
1201 goto out_free;
1205 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1206 skb_shinfo(skb)->gso_type = gso_type;
1208 /* Header must be checked, and gso_segs computed. */
1209 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1210 skb_shinfo(skb)->gso_segs = 0;
1212 len += vnet_hdr_len;
1216 * Now send it
1219 err = dev_queue_xmit(skb);
1220 if (err > 0 && (err = net_xmit_errno(err)) != 0)
1221 goto out_unlock;
1223 dev_put(dev);
1225 return len;
1227 out_free:
1228 kfree_skb(skb);
1229 out_unlock:
1230 if (dev)
1231 dev_put(dev);
1232 out:
1233 return err;
1236 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1237 struct msghdr *msg, size_t len)
1239 struct sock *sk = sock->sk;
1240 struct packet_sock *po = pkt_sk(sk);
1241 if (po->tx_ring.pg_vec)
1242 return tpacket_snd(po, msg);
1243 else
1244 return packet_snd(sock, msg, len);
1248 * Close a PACKET socket. This is fairly simple. We immediately go
1249 * to 'closed' state and remove our protocol entry in the device list.
1252 static int packet_release(struct socket *sock)
1254 struct sock *sk = sock->sk;
1255 struct packet_sock *po;
1256 struct net *net;
1257 struct tpacket_req req;
1259 if (!sk)
1260 return 0;
1262 net = sock_net(sk);
1263 po = pkt_sk(sk);
1265 spin_lock_bh(&net->packet.sklist_lock);
1266 sk_del_node_init_rcu(sk);
1267 sock_prot_inuse_add(net, sk->sk_prot, -1);
1268 spin_unlock_bh(&net->packet.sklist_lock);
1270 spin_lock(&po->bind_lock);
1271 if (po->running) {
1273 * Remove from protocol table
1275 po->running = 0;
1276 po->num = 0;
1277 __dev_remove_pack(&po->prot_hook);
1278 __sock_put(sk);
1280 spin_unlock(&po->bind_lock);
1282 packet_flush_mclist(sk);
1284 memset(&req, 0, sizeof(req));
1286 if (po->rx_ring.pg_vec)
1287 packet_set_ring(sk, &req, 1, 0);
1289 if (po->tx_ring.pg_vec)
1290 packet_set_ring(sk, &req, 1, 1);
1292 synchronize_net();
1294 * Now the socket is dead. No more input will appear.
1296 sock_orphan(sk);
1297 sock->sk = NULL;
1299 /* Purge queues */
1301 skb_queue_purge(&sk->sk_receive_queue);
1302 sk_refcnt_debug_release(sk);
1304 sock_put(sk);
1305 return 0;
1309 * Attach a packet hook.
1312 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1314 struct packet_sock *po = pkt_sk(sk);
1316 * Detach an existing hook if present.
1319 lock_sock(sk);
1321 spin_lock(&po->bind_lock);
1322 if (po->running) {
1323 __sock_put(sk);
1324 po->running = 0;
1325 po->num = 0;
1326 spin_unlock(&po->bind_lock);
1327 dev_remove_pack(&po->prot_hook);
1328 spin_lock(&po->bind_lock);
1331 po->num = protocol;
1332 po->prot_hook.type = protocol;
1333 po->prot_hook.dev = dev;
1335 po->ifindex = dev ? dev->ifindex : 0;
1337 if (protocol == 0)
1338 goto out_unlock;
1340 if (!dev || (dev->flags & IFF_UP)) {
1341 dev_add_pack(&po->prot_hook);
1342 sock_hold(sk);
1343 po->running = 1;
1344 } else {
1345 sk->sk_err = ENETDOWN;
1346 if (!sock_flag(sk, SOCK_DEAD))
1347 sk->sk_error_report(sk);
1350 out_unlock:
1351 spin_unlock(&po->bind_lock);
1352 release_sock(sk);
1353 return 0;
1357 * Bind a packet socket to a device
1360 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1361 int addr_len)
1363 struct sock *sk = sock->sk;
1364 char name[15];
1365 struct net_device *dev;
1366 int err = -ENODEV;
1369 * Check legality
1372 if (addr_len != sizeof(struct sockaddr))
1373 return -EINVAL;
1374 strlcpy(name, uaddr->sa_data, sizeof(name));
1376 dev = dev_get_by_name(sock_net(sk), name);
1377 if (dev) {
1378 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1379 dev_put(dev);
1381 return err;
1384 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1386 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1387 struct sock *sk = sock->sk;
1388 struct net_device *dev = NULL;
1389 int err;
1393 * Check legality
1396 if (addr_len < sizeof(struct sockaddr_ll))
1397 return -EINVAL;
1398 if (sll->sll_family != AF_PACKET)
1399 return -EINVAL;
1401 if (sll->sll_ifindex) {
1402 err = -ENODEV;
1403 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1404 if (dev == NULL)
1405 goto out;
1407 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1408 if (dev)
1409 dev_put(dev);
1411 out:
1412 return err;
1415 static struct proto packet_proto = {
1416 .name = "PACKET",
1417 .owner = THIS_MODULE,
1418 .obj_size = sizeof(struct packet_sock),
1422 * Create a packet of type SOCK_PACKET.
1425 static int packet_create(struct net *net, struct socket *sock, int protocol,
1426 int kern)
1428 struct sock *sk;
1429 struct packet_sock *po;
1430 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1431 int err;
1433 if (!capable(CAP_NET_RAW))
1434 return -EPERM;
1435 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1436 sock->type != SOCK_PACKET)
1437 return -ESOCKTNOSUPPORT;
1439 sock->state = SS_UNCONNECTED;
1441 err = -ENOBUFS;
1442 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1443 if (sk == NULL)
1444 goto out;
1446 sock->ops = &packet_ops;
1447 if (sock->type == SOCK_PACKET)
1448 sock->ops = &packet_ops_spkt;
1450 sock_init_data(sock, sk);
1452 po = pkt_sk(sk);
1453 sk->sk_family = PF_PACKET;
1454 po->num = proto;
1456 sk->sk_destruct = packet_sock_destruct;
1457 sk_refcnt_debug_inc(sk);
1460 * Attach a protocol block
1463 spin_lock_init(&po->bind_lock);
1464 mutex_init(&po->pg_vec_lock);
1465 po->prot_hook.func = packet_rcv;
1467 if (sock->type == SOCK_PACKET)
1468 po->prot_hook.func = packet_rcv_spkt;
1470 po->prot_hook.af_packet_priv = sk;
1472 if (proto) {
1473 po->prot_hook.type = proto;
1474 dev_add_pack(&po->prot_hook);
1475 sock_hold(sk);
1476 po->running = 1;
1479 spin_lock_bh(&net->packet.sklist_lock);
1480 sk_add_node_rcu(sk, &net->packet.sklist);
1481 sock_prot_inuse_add(net, &packet_proto, 1);
1482 spin_unlock_bh(&net->packet.sklist_lock);
1484 return 0;
1485 out:
1486 return err;
1490 * Pull a packet from our receive queue and hand it to the user.
1491 * If necessary we block.
1494 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1495 struct msghdr *msg, size_t len, int flags)
1497 struct sock *sk = sock->sk;
1498 struct sk_buff *skb;
1499 int copied, err;
1500 struct sockaddr_ll *sll;
1501 int vnet_hdr_len = 0;
1503 err = -EINVAL;
1504 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1505 goto out;
1507 #if 0
1508 /* What error should we return now? EUNATTACH? */
1509 if (pkt_sk(sk)->ifindex < 0)
1510 return -ENODEV;
1511 #endif
1514 * Call the generic datagram receiver. This handles all sorts
1515 * of horrible races and re-entrancy so we can forget about it
1516 * in the protocol layers.
1518 * Now it will return ENETDOWN, if device have just gone down,
1519 * but then it will block.
1522 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1525 * An error occurred so return it. Because skb_recv_datagram()
1526 * handles the blocking we don't see and worry about blocking
1527 * retries.
1530 if (skb == NULL)
1531 goto out;
1533 if (pkt_sk(sk)->has_vnet_hdr) {
1534 struct virtio_net_hdr vnet_hdr = { 0 };
1536 err = -EINVAL;
1537 vnet_hdr_len = sizeof(vnet_hdr);
1538 if ((len -= vnet_hdr_len) < 0)
1539 goto out_free;
1541 if (skb_is_gso(skb)) {
1542 struct skb_shared_info *sinfo = skb_shinfo(skb);
1544 /* This is a hint as to how much should be linear. */
1545 vnet_hdr.hdr_len = skb_headlen(skb);
1546 vnet_hdr.gso_size = sinfo->gso_size;
1547 if (sinfo->gso_type & SKB_GSO_TCPV4)
1548 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1549 else if (sinfo->gso_type & SKB_GSO_TCPV6)
1550 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1551 else if (sinfo->gso_type & SKB_GSO_UDP)
1552 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1553 else if (sinfo->gso_type & SKB_GSO_FCOE)
1554 goto out_free;
1555 else
1556 BUG();
1557 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1558 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1559 } else
1560 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1562 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1563 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1564 vnet_hdr.csum_start = skb->csum_start -
1565 skb_headroom(skb);
1566 vnet_hdr.csum_offset = skb->csum_offset;
1567 } /* else everything is zero */
1569 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1570 vnet_hdr_len);
1571 if (err < 0)
1572 goto out_free;
1576 * If the address length field is there to be filled in, we fill
1577 * it in now.
1580 sll = &PACKET_SKB_CB(skb)->sa.ll;
1581 if (sock->type == SOCK_PACKET)
1582 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1583 else
1584 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1587 * You lose any data beyond the buffer you gave. If it worries a
1588 * user program they can ask the device for its MTU anyway.
1591 copied = skb->len;
1592 if (copied > len) {
1593 copied = len;
1594 msg->msg_flags |= MSG_TRUNC;
1597 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1598 if (err)
1599 goto out_free;
1601 sock_recv_ts_and_drops(msg, sk, skb);
1603 if (msg->msg_name)
1604 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1605 msg->msg_namelen);
1607 if (pkt_sk(sk)->auxdata) {
1608 struct tpacket_auxdata aux;
1610 aux.tp_status = TP_STATUS_USER;
1611 if (skb->ip_summed == CHECKSUM_PARTIAL)
1612 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1613 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1614 aux.tp_snaplen = skb->len;
1615 aux.tp_mac = 0;
1616 aux.tp_net = skb_network_offset(skb);
1617 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1619 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1623 * Free or return the buffer as appropriate. Again this
1624 * hides all the races and re-entrancy issues from us.
1626 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1628 out_free:
1629 skb_free_datagram(sk, skb);
1630 out:
1631 return err;
1634 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1635 int *uaddr_len, int peer)
1637 struct net_device *dev;
1638 struct sock *sk = sock->sk;
1640 if (peer)
1641 return -EOPNOTSUPP;
1643 uaddr->sa_family = AF_PACKET;
1644 rcu_read_lock();
1645 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1646 if (dev)
1647 strlcpy(uaddr->sa_data, dev->name, 15);
1648 else
1649 memset(uaddr->sa_data, 0, 14);
1650 rcu_read_unlock();
1651 *uaddr_len = sizeof(*uaddr);
1653 return 0;
1656 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1657 int *uaddr_len, int peer)
1659 struct net_device *dev;
1660 struct sock *sk = sock->sk;
1661 struct packet_sock *po = pkt_sk(sk);
1662 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1664 if (peer)
1665 return -EOPNOTSUPP;
1667 sll->sll_family = AF_PACKET;
1668 sll->sll_ifindex = po->ifindex;
1669 sll->sll_protocol = po->num;
1670 rcu_read_lock();
1671 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1672 if (dev) {
1673 sll->sll_hatype = dev->type;
1674 sll->sll_halen = dev->addr_len;
1675 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1676 } else {
1677 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
1678 sll->sll_halen = 0;
1680 rcu_read_unlock();
1681 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1683 return 0;
1686 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1687 int what)
1689 switch (i->type) {
1690 case PACKET_MR_MULTICAST:
1691 if (what > 0)
1692 return dev_mc_add(dev, i->addr, i->alen, 0);
1693 else
1694 return dev_mc_delete(dev, i->addr, i->alen, 0);
1695 break;
1696 case PACKET_MR_PROMISC:
1697 return dev_set_promiscuity(dev, what);
1698 break;
1699 case PACKET_MR_ALLMULTI:
1700 return dev_set_allmulti(dev, what);
1701 break;
1702 case PACKET_MR_UNICAST:
1703 if (what > 0)
1704 return dev_unicast_add(dev, i->addr);
1705 else
1706 return dev_unicast_delete(dev, i->addr);
1707 break;
1708 default:
1709 break;
1711 return 0;
1714 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1716 for ( ; i; i = i->next) {
1717 if (i->ifindex == dev->ifindex)
1718 packet_dev_mc(dev, i, what);
1722 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1724 struct packet_sock *po = pkt_sk(sk);
1725 struct packet_mclist *ml, *i;
1726 struct net_device *dev;
1727 int err;
1729 rtnl_lock();
1731 err = -ENODEV;
1732 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1733 if (!dev)
1734 goto done;
1736 err = -EINVAL;
1737 if (mreq->mr_alen != dev->addr_len)
1738 goto done;
1740 err = -ENOBUFS;
1741 i = kmalloc(sizeof(*i), GFP_KERNEL);
1742 if (i == NULL)
1743 goto done;
1745 err = 0;
1746 for (ml = po->mclist; ml; ml = ml->next) {
1747 if (ml->ifindex == mreq->mr_ifindex &&
1748 ml->type == mreq->mr_type &&
1749 ml->alen == mreq->mr_alen &&
1750 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1751 ml->count++;
1752 /* Free the new element ... */
1753 kfree(i);
1754 goto done;
1758 i->type = mreq->mr_type;
1759 i->ifindex = mreq->mr_ifindex;
1760 i->alen = mreq->mr_alen;
1761 memcpy(i->addr, mreq->mr_address, i->alen);
1762 i->count = 1;
1763 i->next = po->mclist;
1764 po->mclist = i;
1765 err = packet_dev_mc(dev, i, 1);
1766 if (err) {
1767 po->mclist = i->next;
1768 kfree(i);
1771 done:
1772 rtnl_unlock();
1773 return err;
1776 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1778 struct packet_mclist *ml, **mlp;
1780 rtnl_lock();
1782 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1783 if (ml->ifindex == mreq->mr_ifindex &&
1784 ml->type == mreq->mr_type &&
1785 ml->alen == mreq->mr_alen &&
1786 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1787 if (--ml->count == 0) {
1788 struct net_device *dev;
1789 *mlp = ml->next;
1790 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1791 if (dev)
1792 packet_dev_mc(dev, ml, -1);
1793 kfree(ml);
1795 rtnl_unlock();
1796 return 0;
1799 rtnl_unlock();
1800 return -EADDRNOTAVAIL;
1803 static void packet_flush_mclist(struct sock *sk)
1805 struct packet_sock *po = pkt_sk(sk);
1806 struct packet_mclist *ml;
1808 if (!po->mclist)
1809 return;
1811 rtnl_lock();
1812 while ((ml = po->mclist) != NULL) {
1813 struct net_device *dev;
1815 po->mclist = ml->next;
1816 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1817 if (dev != NULL)
1818 packet_dev_mc(dev, ml, -1);
1819 kfree(ml);
1821 rtnl_unlock();
1824 static int
1825 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1827 struct sock *sk = sock->sk;
1828 struct packet_sock *po = pkt_sk(sk);
1829 int ret;
1831 if (level != SOL_PACKET)
1832 return -ENOPROTOOPT;
1834 switch (optname) {
1835 case PACKET_ADD_MEMBERSHIP:
1836 case PACKET_DROP_MEMBERSHIP:
1838 struct packet_mreq_max mreq;
1839 int len = optlen;
1840 memset(&mreq, 0, sizeof(mreq));
1841 if (len < sizeof(struct packet_mreq))
1842 return -EINVAL;
1843 if (len > sizeof(mreq))
1844 len = sizeof(mreq);
1845 if (copy_from_user(&mreq, optval, len))
1846 return -EFAULT;
1847 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1848 return -EINVAL;
1849 if (optname == PACKET_ADD_MEMBERSHIP)
1850 ret = packet_mc_add(sk, &mreq);
1851 else
1852 ret = packet_mc_drop(sk, &mreq);
1853 return ret;
1856 case PACKET_RX_RING:
1857 case PACKET_TX_RING:
1859 struct tpacket_req req;
1861 if (optlen < sizeof(req))
1862 return -EINVAL;
1863 if (pkt_sk(sk)->has_vnet_hdr)
1864 return -EINVAL;
1865 if (copy_from_user(&req, optval, sizeof(req)))
1866 return -EFAULT;
1867 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1869 case PACKET_COPY_THRESH:
1871 int val;
1873 if (optlen != sizeof(val))
1874 return -EINVAL;
1875 if (copy_from_user(&val, optval, sizeof(val)))
1876 return -EFAULT;
1878 pkt_sk(sk)->copy_thresh = val;
1879 return 0;
1881 case PACKET_VERSION:
1883 int val;
1885 if (optlen != sizeof(val))
1886 return -EINVAL;
1887 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1888 return -EBUSY;
1889 if (copy_from_user(&val, optval, sizeof(val)))
1890 return -EFAULT;
1891 switch (val) {
1892 case TPACKET_V1:
1893 case TPACKET_V2:
1894 po->tp_version = val;
1895 return 0;
1896 default:
1897 return -EINVAL;
1900 case PACKET_RESERVE:
1902 unsigned int val;
1904 if (optlen != sizeof(val))
1905 return -EINVAL;
1906 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1907 return -EBUSY;
1908 if (copy_from_user(&val, optval, sizeof(val)))
1909 return -EFAULT;
1910 po->tp_reserve = val;
1911 return 0;
1913 case PACKET_LOSS:
1915 unsigned int val;
1917 if (optlen != sizeof(val))
1918 return -EINVAL;
1919 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1920 return -EBUSY;
1921 if (copy_from_user(&val, optval, sizeof(val)))
1922 return -EFAULT;
1923 po->tp_loss = !!val;
1924 return 0;
1926 case PACKET_AUXDATA:
1928 int val;
1930 if (optlen < sizeof(val))
1931 return -EINVAL;
1932 if (copy_from_user(&val, optval, sizeof(val)))
1933 return -EFAULT;
1935 po->auxdata = !!val;
1936 return 0;
1938 case PACKET_ORIGDEV:
1940 int val;
1942 if (optlen < sizeof(val))
1943 return -EINVAL;
1944 if (copy_from_user(&val, optval, sizeof(val)))
1945 return -EFAULT;
1947 po->origdev = !!val;
1948 return 0;
1950 case PACKET_VNET_HDR:
1952 int val;
1954 if (sock->type != SOCK_RAW)
1955 return -EINVAL;
1956 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1957 return -EBUSY;
1958 if (optlen < sizeof(val))
1959 return -EINVAL;
1960 if (copy_from_user(&val, optval, sizeof(val)))
1961 return -EFAULT;
1963 po->has_vnet_hdr = !!val;
1964 return 0;
1966 default:
1967 return -ENOPROTOOPT;
1971 static int packet_getsockopt(struct socket *sock, int level, int optname,
1972 char __user *optval, int __user *optlen)
1974 int len;
1975 int val;
1976 struct sock *sk = sock->sk;
1977 struct packet_sock *po = pkt_sk(sk);
1978 void *data;
1979 struct tpacket_stats st;
1981 if (level != SOL_PACKET)
1982 return -ENOPROTOOPT;
1984 if (get_user(len, optlen))
1985 return -EFAULT;
1987 if (len < 0)
1988 return -EINVAL;
1990 switch (optname) {
1991 case PACKET_STATISTICS:
1992 if (len > sizeof(struct tpacket_stats))
1993 len = sizeof(struct tpacket_stats);
1994 spin_lock_bh(&sk->sk_receive_queue.lock);
1995 st = po->stats;
1996 memset(&po->stats, 0, sizeof(st));
1997 spin_unlock_bh(&sk->sk_receive_queue.lock);
1998 st.tp_packets += st.tp_drops;
2000 data = &st;
2001 break;
2002 case PACKET_AUXDATA:
2003 if (len > sizeof(int))
2004 len = sizeof(int);
2005 val = po->auxdata;
2007 data = &val;
2008 break;
2009 case PACKET_ORIGDEV:
2010 if (len > sizeof(int))
2011 len = sizeof(int);
2012 val = po->origdev;
2014 data = &val;
2015 break;
2016 case PACKET_VNET_HDR:
2017 if (len > sizeof(int))
2018 len = sizeof(int);
2019 val = po->has_vnet_hdr;
2021 data = &val;
2022 break;
2023 case PACKET_VERSION:
2024 if (len > sizeof(int))
2025 len = sizeof(int);
2026 val = po->tp_version;
2027 data = &val;
2028 break;
2029 case PACKET_HDRLEN:
2030 if (len > sizeof(int))
2031 len = sizeof(int);
2032 if (copy_from_user(&val, optval, len))
2033 return -EFAULT;
2034 switch (val) {
2035 case TPACKET_V1:
2036 val = sizeof(struct tpacket_hdr);
2037 break;
2038 case TPACKET_V2:
2039 val = sizeof(struct tpacket2_hdr);
2040 break;
2041 default:
2042 return -EINVAL;
2044 data = &val;
2045 break;
2046 case PACKET_RESERVE:
2047 if (len > sizeof(unsigned int))
2048 len = sizeof(unsigned int);
2049 val = po->tp_reserve;
2050 data = &val;
2051 break;
2052 case PACKET_LOSS:
2053 if (len > sizeof(unsigned int))
2054 len = sizeof(unsigned int);
2055 val = po->tp_loss;
2056 data = &val;
2057 break;
2058 default:
2059 return -ENOPROTOOPT;
2062 if (put_user(len, optlen))
2063 return -EFAULT;
2064 if (copy_to_user(optval, data, len))
2065 return -EFAULT;
2066 return 0;
2070 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2072 struct sock *sk;
2073 struct hlist_node *node;
2074 struct net_device *dev = data;
2075 struct net *net = dev_net(dev);
2077 rcu_read_lock();
2078 sk_for_each_rcu(sk, node, &net->packet.sklist) {
2079 struct packet_sock *po = pkt_sk(sk);
2081 switch (msg) {
2082 case NETDEV_UNREGISTER:
2083 if (po->mclist)
2084 packet_dev_mclist(dev, po->mclist, -1);
2085 /* fallthrough */
2087 case NETDEV_DOWN:
2088 if (dev->ifindex == po->ifindex) {
2089 spin_lock(&po->bind_lock);
2090 if (po->running) {
2091 __dev_remove_pack(&po->prot_hook);
2092 __sock_put(sk);
2093 po->running = 0;
2094 sk->sk_err = ENETDOWN;
2095 if (!sock_flag(sk, SOCK_DEAD))
2096 sk->sk_error_report(sk);
2098 if (msg == NETDEV_UNREGISTER) {
2099 po->ifindex = -1;
2100 po->prot_hook.dev = NULL;
2102 spin_unlock(&po->bind_lock);
2104 break;
2105 case NETDEV_UP:
2106 if (dev->ifindex == po->ifindex) {
2107 spin_lock(&po->bind_lock);
2108 if (po->num && !po->running) {
2109 dev_add_pack(&po->prot_hook);
2110 sock_hold(sk);
2111 po->running = 1;
2113 spin_unlock(&po->bind_lock);
2115 break;
2118 rcu_read_unlock();
2119 return NOTIFY_DONE;
2123 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2124 unsigned long arg)
2126 struct sock *sk = sock->sk;
2128 switch (cmd) {
2129 case SIOCOUTQ:
2131 int amount = sk_wmem_alloc_get(sk);
2133 return put_user(amount, (int __user *)arg);
2135 case SIOCINQ:
2137 struct sk_buff *skb;
2138 int amount = 0;
2140 spin_lock_bh(&sk->sk_receive_queue.lock);
2141 skb = skb_peek(&sk->sk_receive_queue);
2142 if (skb)
2143 amount = skb->len;
2144 spin_unlock_bh(&sk->sk_receive_queue.lock);
2145 return put_user(amount, (int __user *)arg);
2147 case SIOCGSTAMP:
2148 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2149 case SIOCGSTAMPNS:
2150 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2152 #ifdef CONFIG_INET
2153 case SIOCADDRT:
2154 case SIOCDELRT:
2155 case SIOCDARP:
2156 case SIOCGARP:
2157 case SIOCSARP:
2158 case SIOCGIFADDR:
2159 case SIOCSIFADDR:
2160 case SIOCGIFBRDADDR:
2161 case SIOCSIFBRDADDR:
2162 case SIOCGIFNETMASK:
2163 case SIOCSIFNETMASK:
2164 case SIOCGIFDSTADDR:
2165 case SIOCSIFDSTADDR:
2166 case SIOCSIFFLAGS:
2167 if (!net_eq(sock_net(sk), &init_net))
2168 return -ENOIOCTLCMD;
2169 return inet_dgram_ops.ioctl(sock, cmd, arg);
2170 #endif
2172 default:
2173 return -ENOIOCTLCMD;
2175 return 0;
2178 static unsigned int packet_poll(struct file *file, struct socket *sock,
2179 poll_table *wait)
2181 struct sock *sk = sock->sk;
2182 struct packet_sock *po = pkt_sk(sk);
2183 unsigned int mask = datagram_poll(file, sock, wait);
2185 spin_lock_bh(&sk->sk_receive_queue.lock);
2186 if (po->rx_ring.pg_vec) {
2187 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2188 mask |= POLLIN | POLLRDNORM;
2190 spin_unlock_bh(&sk->sk_receive_queue.lock);
2191 spin_lock_bh(&sk->sk_write_queue.lock);
2192 if (po->tx_ring.pg_vec) {
2193 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2194 mask |= POLLOUT | POLLWRNORM;
2196 spin_unlock_bh(&sk->sk_write_queue.lock);
2197 return mask;
2201 /* Dirty? Well, I still did not learn better way to account
2202 * for user mmaps.
2205 static void packet_mm_open(struct vm_area_struct *vma)
2207 struct file *file = vma->vm_file;
2208 struct socket *sock = file->private_data;
2209 struct sock *sk = sock->sk;
2211 if (sk)
2212 atomic_inc(&pkt_sk(sk)->mapped);
2215 static void packet_mm_close(struct vm_area_struct *vma)
2217 struct file *file = vma->vm_file;
2218 struct socket *sock = file->private_data;
2219 struct sock *sk = sock->sk;
2221 if (sk)
2222 atomic_dec(&pkt_sk(sk)->mapped);
2225 static const struct vm_operations_struct packet_mmap_ops = {
2226 .open = packet_mm_open,
2227 .close = packet_mm_close,
/*
 *	Release a block vector: free every non-NULL block of the given
 *	page @order among the first @len slots, then the vector itself.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int idx = 0;

	while (idx < len) {
		char *block = pg_vec[idx];

		if (likely(block))
			free_pages((unsigned long) block, order);
		idx++;
	}
	kfree(pg_vec);
}
2241 static inline char *alloc_one_pg_vec_page(unsigned long order)
2243 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2245 return (char *) __get_free_pages(gfp_flags, order);
2248 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2250 unsigned int block_nr = req->tp_block_nr;
2251 char **pg_vec;
2252 int i;
2254 pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2255 if (unlikely(!pg_vec))
2256 goto out;
2258 for (i = 0; i < block_nr; i++) {
2259 pg_vec[i] = alloc_one_pg_vec_page(order);
2260 if (unlikely(!pg_vec[i]))
2261 goto out_free_pgvec;
2264 out:
2265 return pg_vec;
2267 out_free_pgvec:
2268 free_pg_vec(pg_vec, order, block_nr);
2269 pg_vec = NULL;
2270 goto out;
2273 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2274 int closing, int tx_ring)
2276 char **pg_vec = NULL;
2277 struct packet_sock *po = pkt_sk(sk);
2278 int was_running, order = 0;
2279 struct packet_ring_buffer *rb;
2280 struct sk_buff_head *rb_queue;
2281 __be16 num;
2282 int err;
2284 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2285 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2287 err = -EBUSY;
2288 if (!closing) {
2289 if (atomic_read(&po->mapped))
2290 goto out;
2291 if (atomic_read(&rb->pending))
2292 goto out;
2295 if (req->tp_block_nr) {
2296 /* Sanity tests and some calculations */
2297 err = -EBUSY;
2298 if (unlikely(rb->pg_vec))
2299 goto out;
2301 switch (po->tp_version) {
2302 case TPACKET_V1:
2303 po->tp_hdrlen = TPACKET_HDRLEN;
2304 break;
2305 case TPACKET_V2:
2306 po->tp_hdrlen = TPACKET2_HDRLEN;
2307 break;
2310 err = -EINVAL;
2311 if (unlikely((int)req->tp_block_size <= 0))
2312 goto out;
2313 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2314 goto out;
2315 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2316 po->tp_reserve))
2317 goto out;
2318 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2319 goto out;
2321 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2322 if (unlikely(rb->frames_per_block <= 0))
2323 goto out;
2324 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2325 req->tp_frame_nr))
2326 goto out;
2328 err = -ENOMEM;
2329 order = get_order(req->tp_block_size);
2330 pg_vec = alloc_pg_vec(req, order);
2331 if (unlikely(!pg_vec))
2332 goto out;
2334 /* Done */
2335 else {
2336 err = -EINVAL;
2337 if (unlikely(req->tp_frame_nr))
2338 goto out;
2341 lock_sock(sk);
2343 /* Detach socket from network */
2344 spin_lock(&po->bind_lock);
2345 was_running = po->running;
2346 num = po->num;
2347 if (was_running) {
2348 __dev_remove_pack(&po->prot_hook);
2349 po->num = 0;
2350 po->running = 0;
2351 __sock_put(sk);
2353 spin_unlock(&po->bind_lock);
2355 synchronize_net();
2357 err = -EBUSY;
2358 mutex_lock(&po->pg_vec_lock);
2359 if (closing || atomic_read(&po->mapped) == 0) {
2360 err = 0;
2361 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2362 spin_lock_bh(&rb_queue->lock);
2363 pg_vec = XC(rb->pg_vec, pg_vec);
2364 rb->frame_max = (req->tp_frame_nr - 1);
2365 rb->head = 0;
2366 rb->frame_size = req->tp_frame_size;
2367 spin_unlock_bh(&rb_queue->lock);
2369 order = XC(rb->pg_vec_order, order);
2370 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2372 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2373 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2374 tpacket_rcv : packet_rcv;
2375 skb_queue_purge(rb_queue);
2376 #undef XC
2377 if (atomic_read(&po->mapped))
2378 pr_err("packet_mmap: vma is busy: %d\n",
2379 atomic_read(&po->mapped));
2381 mutex_unlock(&po->pg_vec_lock);
2383 spin_lock(&po->bind_lock);
2384 if (was_running && !po->running) {
2385 sock_hold(sk);
2386 po->running = 1;
2387 po->num = num;
2388 dev_add_pack(&po->prot_hook);
2390 spin_unlock(&po->bind_lock);
2392 release_sock(sk);
2394 if (pg_vec)
2395 free_pg_vec(pg_vec, order, req->tp_block_nr);
2396 out:
2397 return err;
2400 static int packet_mmap(struct file *file, struct socket *sock,
2401 struct vm_area_struct *vma)
2403 struct sock *sk = sock->sk;
2404 struct packet_sock *po = pkt_sk(sk);
2405 unsigned long size, expected_size;
2406 struct packet_ring_buffer *rb;
2407 unsigned long start;
2408 int err = -EINVAL;
2409 int i;
2411 if (vma->vm_pgoff)
2412 return -EINVAL;
2414 mutex_lock(&po->pg_vec_lock);
2416 expected_size = 0;
2417 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2418 if (rb->pg_vec) {
2419 expected_size += rb->pg_vec_len
2420 * rb->pg_vec_pages
2421 * PAGE_SIZE;
2425 if (expected_size == 0)
2426 goto out;
2428 size = vma->vm_end - vma->vm_start;
2429 if (size != expected_size)
2430 goto out;
2432 start = vma->vm_start;
2433 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2434 if (rb->pg_vec == NULL)
2435 continue;
2437 for (i = 0; i < rb->pg_vec_len; i++) {
2438 struct page *page = virt_to_page(rb->pg_vec[i]);
2439 int pg_num;
2441 for (pg_num = 0; pg_num < rb->pg_vec_pages;
2442 pg_num++, page++) {
2443 err = vm_insert_page(vma, start, page);
2444 if (unlikely(err))
2445 goto out;
2446 start += PAGE_SIZE;
2451 atomic_inc(&po->mapped);
2452 vma->vm_ops = &packet_mmap_ops;
2453 err = 0;
2455 out:
2456 mutex_unlock(&po->pg_vec_lock);
2457 return err;
2460 static const struct proto_ops packet_ops_spkt = {
2461 .family = PF_PACKET,
2462 .owner = THIS_MODULE,
2463 .release = packet_release,
2464 .bind = packet_bind_spkt,
2465 .connect = sock_no_connect,
2466 .socketpair = sock_no_socketpair,
2467 .accept = sock_no_accept,
2468 .getname = packet_getname_spkt,
2469 .poll = datagram_poll,
2470 .ioctl = packet_ioctl,
2471 .listen = sock_no_listen,
2472 .shutdown = sock_no_shutdown,
2473 .setsockopt = sock_no_setsockopt,
2474 .getsockopt = sock_no_getsockopt,
2475 .sendmsg = packet_sendmsg_spkt,
2476 .recvmsg = packet_recvmsg,
2477 .mmap = sock_no_mmap,
2478 .sendpage = sock_no_sendpage,
2481 static const struct proto_ops packet_ops = {
2482 .family = PF_PACKET,
2483 .owner = THIS_MODULE,
2484 .release = packet_release,
2485 .bind = packet_bind,
2486 .connect = sock_no_connect,
2487 .socketpair = sock_no_socketpair,
2488 .accept = sock_no_accept,
2489 .getname = packet_getname,
2490 .poll = packet_poll,
2491 .ioctl = packet_ioctl,
2492 .listen = sock_no_listen,
2493 .shutdown = sock_no_shutdown,
2494 .setsockopt = packet_setsockopt,
2495 .getsockopt = packet_getsockopt,
2496 .sendmsg = packet_sendmsg,
2497 .recvmsg = packet_recvmsg,
2498 .mmap = packet_mmap,
2499 .sendpage = sock_no_sendpage,
2502 static const struct net_proto_family packet_family_ops = {
2503 .family = PF_PACKET,
2504 .create = packet_create,
2505 .owner = THIS_MODULE,
2508 static struct notifier_block packet_netdev_notifier = {
2509 .notifier_call = packet_notifier,
2512 #ifdef CONFIG_PROC_FS
2514 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2515 __acquires(RCU)
2517 struct net *net = seq_file_net(seq);
2519 rcu_read_lock();
2520 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2523 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2525 struct net *net = seq_file_net(seq);
2526 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2529 static void packet_seq_stop(struct seq_file *seq, void *v)
2530 __releases(RCU)
2532 rcu_read_unlock();
2535 static int packet_seq_show(struct seq_file *seq, void *v)
2537 if (v == SEQ_START_TOKEN)
2538 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
2539 else {
2540 struct sock *s = sk_entry(v);
2541 const struct packet_sock *po = pkt_sk(s);
2543 seq_printf(seq,
2544 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
2546 atomic_read(&s->sk_refcnt),
2547 s->sk_type,
2548 ntohs(po->num),
2549 po->ifindex,
2550 po->running,
2551 atomic_read(&s->sk_rmem_alloc),
2552 sock_i_uid(s),
2553 sock_i_ino(s));
2556 return 0;
2559 static const struct seq_operations packet_seq_ops = {
2560 .start = packet_seq_start,
2561 .next = packet_seq_next,
2562 .stop = packet_seq_stop,
2563 .show = packet_seq_show,
2566 static int packet_seq_open(struct inode *inode, struct file *file)
2568 return seq_open_net(inode, file, &packet_seq_ops,
2569 sizeof(struct seq_net_private));
2572 static const struct file_operations packet_seq_fops = {
2573 .owner = THIS_MODULE,
2574 .open = packet_seq_open,
2575 .read = seq_read,
2576 .llseek = seq_lseek,
2577 .release = seq_release_net,
2580 #endif
2582 static int __net_init packet_net_init(struct net *net)
2584 spin_lock_init(&net->packet.sklist_lock);
2585 INIT_HLIST_HEAD(&net->packet.sklist);
2587 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2588 return -ENOMEM;
2590 return 0;
2593 static void __net_exit packet_net_exit(struct net *net)
2595 proc_net_remove(net, "packet");
2598 static struct pernet_operations packet_net_ops = {
2599 .init = packet_net_init,
2600 .exit = packet_net_exit,
2604 static void __exit packet_exit(void)
2606 unregister_netdevice_notifier(&packet_netdev_notifier);
2607 unregister_pernet_subsys(&packet_net_ops);
2608 sock_unregister(PF_PACKET);
2609 proto_unregister(&packet_proto);
2612 static int __init packet_init(void)
2614 int rc = proto_register(&packet_proto, 0);
2616 if (rc != 0)
2617 goto out;
2619 sock_register(&packet_family_ops);
2620 register_pernet_subsys(&packet_net_ops);
2621 register_netdevice_notifier(&packet_netdev_notifier);
2622 out:
2623 return rc;
2626 module_init(packet_init);
2627 module_exit(packet_exit);
2628 MODULE_LICENSE("GPL");
2629 MODULE_ALIAS_NETPROTO(PF_PACKET);