2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
35 #include <net/protocol.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
44 #include <net/rtnetlink.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
56 1. The most important issue is detecting local dead loops.
57 They would cause complete host lockup in transmit, which
58 would be "resolved" by stack overflow or, if queueing is enabled,
59 with infinite looping in net_bh.
61 We cannot track such dead loops during route installation,
62 it is infeasible task. The most general solutions would be
63 to keep skb->encapsulation counter (sort of local ttl),
64 and silently drop packet when it expires. It is the best
65 solution, but it supposes maintaining a new variable in ALL
66 skb, even if no tunneling is used.
68 Current solution: t->recursion lock breaks dead loops. It looks
69 like dev->tbusy flag, but I preferred new variable, because
70 the semantics is different. One day, when hard_start_xmit
71 will be multithreaded we will have to use skb->encapsulation.
75 2. Networking dead loops would not kill routers, but would really
76 kill network. IP hop limit plays role of "t->recursion" in this case,
77 if we copy it from packet being encapsulated to upper header.
78 It is very good solution, but it introduces two problems:
80 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81 do not work over tunnels.
82 - traceroute does not work. I planned to relay ICMP from tunnel,
83 so that this problem would be solved and traceroute output
84 would be even more informative. This idea appeared to be wrong:
85 only Linux complies to rfc1812 now (yes, guys, Linux is the only
86 true router now :-)), all routers (at least, in neighbourhood of mine)
87 return only 8 bytes of payload. It is the end.
89 Hence, if we want that OSPF worked or traceroute said something reasonable,
90 we should search for another solution.
92 One of them is to parse packet trying to detect inner encapsulation
93 made by our node. It is difficult or even impossible, especially,
94 taking into account fragmentation. To be short, it is not a solution at all.
96 Current solution: The solution was UNEXPECTEDLY SIMPLE.
97 We force DF flag on tunnels with preconfigured hop limit,
98 that is ALL. :-) Well, it does not remove the problem completely,
99 but exponential growth of network traffic is changed to linear
100 (branches, that exceed pmtu are pruned) and tunnel mtu
101 fastly degrades to value <68, where looping stops.
102 Yes, it is not good if there exists a router in the loop,
103 which does not force DF, even when encapsulating packets have DF set.
104 But it is not our problem! Nobody could accuse us, we made
105 all that we could make. Even if it is your gated who injected
106 fatal route to network, even if it were you who configured
107 fatal static route: you are innocent. :-)
111 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112 practically identical code. It would be good to glue them
113 together, but it is not very evident, how to make them modular.
114 sit is integral part of IPv6, ipip and gre are naturally modular.
115 We could extract common parts (hash table, ioctl etc)
116 to a separate module (ip_tunnel.c).
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly
;
122 static int ipgre_tunnel_init(struct net_device
*dev
);
123 static void ipgre_tunnel_setup(struct net_device
*dev
);
124 static int ipgre_tunnel_bind_dev(struct net_device
*dev
);
126 /* Fallback tunnel: no source, no destination, no key, no options */
128 static int ipgre_fb_tunnel_init(struct net_device
*dev
);
132 static int ipgre_net_id
;
134 struct ip_tunnel
*tunnels
[4][HASH_SIZE
];
136 struct net_device
*fb_tunnel_dev
;
139 /* Tunnel hash table */
149 We require exact key match i.e. if a key is present in packet
150 it will match only tunnel with the same key; if it is not present,
151 it will match only keyless tunnel.
153 All keysless packets, if not matched configured keyless tunnels
154 will match fallback tunnel.
157 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
159 #define tunnels_r_l tunnels[3]
160 #define tunnels_r tunnels[2]
161 #define tunnels_l tunnels[1]
162 #define tunnels_wc tunnels[0]
164 static DEFINE_RWLOCK(ipgre_lock
);
166 /* Given src, dst and key, find appropriate for input tunnel. */
168 static struct ip_tunnel
* ipgre_tunnel_lookup(struct net
*net
,
169 __be32 remote
, __be32 local
, __be32 key
)
171 unsigned h0
= HASH(remote
);
172 unsigned h1
= HASH(key
);
174 struct ipgre_net
*ign
= net_generic(net
, ipgre_net_id
);
176 for (t
= ign
->tunnels_r_l
[h0
^h1
]; t
; t
= t
->next
) {
177 if (local
== t
->parms
.iph
.saddr
&& remote
== t
->parms
.iph
.daddr
) {
178 if (t
->parms
.i_key
== key
&& (t
->dev
->flags
&IFF_UP
))
182 for (t
= ign
->tunnels_r
[h0
^h1
]; t
; t
= t
->next
) {
183 if (remote
== t
->parms
.iph
.daddr
) {
184 if (t
->parms
.i_key
== key
&& (t
->dev
->flags
&IFF_UP
))
188 for (t
= ign
->tunnels_l
[h1
]; t
; t
= t
->next
) {
189 if (local
== t
->parms
.iph
.saddr
||
190 (local
== t
->parms
.iph
.daddr
&&
191 ipv4_is_multicast(local
))) {
192 if (t
->parms
.i_key
== key
&& (t
->dev
->flags
&IFF_UP
))
196 for (t
= ign
->tunnels_wc
[h1
]; t
; t
= t
->next
) {
197 if (t
->parms
.i_key
== key
&& (t
->dev
->flags
&IFF_UP
))
201 if (ign
->fb_tunnel_dev
->flags
&IFF_UP
)
202 return netdev_priv(ign
->fb_tunnel_dev
);
206 static struct ip_tunnel
**__ipgre_bucket(struct ipgre_net
*ign
,
207 struct ip_tunnel_parm
*parms
)
209 __be32 remote
= parms
->iph
.daddr
;
210 __be32 local
= parms
->iph
.saddr
;
211 __be32 key
= parms
->i_key
;
212 unsigned h
= HASH(key
);
217 if (remote
&& !ipv4_is_multicast(remote
)) {
222 return &ign
->tunnels
[prio
][h
];
225 static inline struct ip_tunnel
**ipgre_bucket(struct ipgre_net
*ign
,
228 return __ipgre_bucket(ign
, &t
->parms
);
231 static void ipgre_tunnel_link(struct ipgre_net
*ign
, struct ip_tunnel
*t
)
233 struct ip_tunnel
**tp
= ipgre_bucket(ign
, t
);
236 write_lock_bh(&ipgre_lock
);
238 write_unlock_bh(&ipgre_lock
);
241 static void ipgre_tunnel_unlink(struct ipgre_net
*ign
, struct ip_tunnel
*t
)
243 struct ip_tunnel
**tp
;
245 for (tp
= ipgre_bucket(ign
, t
); *tp
; tp
= &(*tp
)->next
) {
247 write_lock_bh(&ipgre_lock
);
249 write_unlock_bh(&ipgre_lock
);
255 static struct ip_tunnel
* ipgre_tunnel_locate(struct net
*net
,
256 struct ip_tunnel_parm
*parms
, int create
)
258 __be32 remote
= parms
->iph
.daddr
;
259 __be32 local
= parms
->iph
.saddr
;
260 __be32 key
= parms
->i_key
;
261 struct ip_tunnel
*t
, **tp
, *nt
;
262 struct net_device
*dev
;
264 struct ipgre_net
*ign
= net_generic(net
, ipgre_net_id
);
266 for (tp
= __ipgre_bucket(ign
, parms
); (t
= *tp
) != NULL
; tp
= &t
->next
) {
267 if (local
== t
->parms
.iph
.saddr
&& remote
== t
->parms
.iph
.daddr
) {
268 if (key
== t
->parms
.i_key
)
276 strlcpy(name
, parms
->name
, IFNAMSIZ
);
278 sprintf(name
, "gre%%d");
280 dev
= alloc_netdev(sizeof(*t
), name
, ipgre_tunnel_setup
);
284 dev_net_set(dev
, net
);
286 if (strchr(name
, '%')) {
287 if (dev_alloc_name(dev
, name
) < 0)
291 nt
= netdev_priv(dev
);
293 dev
->rtnl_link_ops
= &ipgre_link_ops
;
295 dev
->mtu
= ipgre_tunnel_bind_dev(dev
);
297 if (register_netdevice(dev
) < 0)
301 ipgre_tunnel_link(ign
, nt
);
309 static void ipgre_tunnel_uninit(struct net_device
*dev
)
311 struct net
*net
= dev_net(dev
);
312 struct ipgre_net
*ign
= net_generic(net
, ipgre_net_id
);
314 ipgre_tunnel_unlink(ign
, netdev_priv(dev
));
319 static void ipgre_err(struct sk_buff
*skb
, u32 info
)
322 /* All the routers (except for Linux) return only
323 8 bytes of packet payload. It means, that precise relaying of
324 ICMP in the real Internet is absolutely infeasible.
326 Moreover, Cisco "wise men" put GRE key to the third word
327 in GRE header. It makes impossible maintaining even soft state for keyed
328 GRE tunnels with enabled checksum. Tell them "thank you".
330 Well, I wonder, rfc1812 was written by Cisco employee,
331 what the hell these idiots break standards established
335 struct iphdr
*iph
= (struct iphdr
*)skb
->data
;
336 __be16
*p
= (__be16
*)(skb
->data
+(iph
->ihl
<<2));
337 int grehlen
= (iph
->ihl
<<2) + 4;
338 const int type
= icmp_hdr(skb
)->type
;
339 const int code
= icmp_hdr(skb
)->code
;
344 if (flags
&(GRE_CSUM
|GRE_KEY
|GRE_SEQ
|GRE_ROUTING
|GRE_VERSION
)) {
345 if (flags
&(GRE_VERSION
|GRE_ROUTING
))
354 /* If only 8 bytes returned, keyed message will be dropped here */
355 if (skb_headlen(skb
) < grehlen
)
360 case ICMP_PARAMETERPROB
:
363 case ICMP_DEST_UNREACH
:
366 case ICMP_PORT_UNREACH
:
367 /* Impossible event. */
369 case ICMP_FRAG_NEEDED
:
370 /* Soft state for pmtu is maintained by IP core. */
373 /* All others are translated to HOST_UNREACH.
374 rfc2003 contains "deep thoughts" about NET_UNREACH,
375 I believe they are just ether pollution. --ANK
380 case ICMP_TIME_EXCEEDED
:
381 if (code
!= ICMP_EXC_TTL
)
386 read_lock(&ipgre_lock
);
387 t
= ipgre_tunnel_lookup(dev_net(skb
->dev
), iph
->daddr
, iph
->saddr
,
389 *(((__be32
*)p
) + (grehlen
>>2) - 1) : 0);
390 if (t
== NULL
|| t
->parms
.iph
.daddr
== 0 ||
391 ipv4_is_multicast(t
->parms
.iph
.daddr
))
394 if (t
->parms
.iph
.ttl
== 0 && type
== ICMP_TIME_EXCEEDED
)
397 if (jiffies
- t
->err_time
< IPTUNNEL_ERR_TIMEO
)
401 t
->err_time
= jiffies
;
403 read_unlock(&ipgre_lock
);
407 static inline void ipgre_ecn_decapsulate(struct iphdr
*iph
, struct sk_buff
*skb
)
409 if (INET_ECN_is_ce(iph
->tos
)) {
410 if (skb
->protocol
== htons(ETH_P_IP
)) {
411 IP_ECN_set_ce(ip_hdr(skb
));
412 } else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
413 IP6_ECN_set_ce(ipv6_hdr(skb
));
419 ipgre_ecn_encapsulate(u8 tos
, struct iphdr
*old_iph
, struct sk_buff
*skb
)
422 if (skb
->protocol
== htons(ETH_P_IP
))
423 inner
= old_iph
->tos
;
424 else if (skb
->protocol
== htons(ETH_P_IPV6
))
425 inner
= ipv6_get_dsfield((struct ipv6hdr
*)old_iph
);
426 return INET_ECN_encapsulate(tos
, inner
);
429 static int ipgre_rcv(struct sk_buff
*skb
)
437 struct ip_tunnel
*tunnel
;
440 if (!pskb_may_pull(skb
, 16))
447 if (flags
&(GRE_CSUM
|GRE_KEY
|GRE_ROUTING
|GRE_SEQ
|GRE_VERSION
)) {
448 /* - Version must be 0.
449 - We do not support routing headers.
451 if (flags
&(GRE_VERSION
|GRE_ROUTING
))
454 if (flags
&GRE_CSUM
) {
455 switch (skb
->ip_summed
) {
456 case CHECKSUM_COMPLETE
:
457 csum
= csum_fold(skb
->csum
);
463 csum
= __skb_checksum_complete(skb
);
464 skb
->ip_summed
= CHECKSUM_COMPLETE
;
469 key
= *(__be32
*)(h
+ offset
);
473 seqno
= ntohl(*(__be32
*)(h
+ offset
));
478 read_lock(&ipgre_lock
);
479 if ((tunnel
= ipgre_tunnel_lookup(dev_net(skb
->dev
),
480 iph
->saddr
, iph
->daddr
, key
)) != NULL
) {
481 struct net_device_stats
*stats
= &tunnel
->dev
->stats
;
485 skb
->protocol
= *(__be16
*)(h
+ 2);
486 /* WCCP version 1 and 2 protocol decoding.
487 * - Change protocol to IP
488 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
491 skb
->protocol
== htons(ETH_P_WCCP
)) {
492 skb
->protocol
= htons(ETH_P_IP
);
493 if ((*(h
+ offset
) & 0xF0) != 0x40)
497 skb
->mac_header
= skb
->network_header
;
498 __pskb_pull(skb
, offset
);
499 skb_reset_network_header(skb
);
500 skb_postpull_rcsum(skb
, skb_transport_header(skb
), offset
);
501 skb
->pkt_type
= PACKET_HOST
;
502 #ifdef CONFIG_NET_IPGRE_BROADCAST
503 if (ipv4_is_multicast(iph
->daddr
)) {
504 /* Looped back packet, drop it! */
505 if (skb
->rtable
->fl
.iif
== 0)
508 skb
->pkt_type
= PACKET_BROADCAST
;
512 if (((flags
&GRE_CSUM
) && csum
) ||
513 (!(flags
&GRE_CSUM
) && tunnel
->parms
.i_flags
&GRE_CSUM
)) {
514 stats
->rx_crc_errors
++;
518 if (tunnel
->parms
.i_flags
&GRE_SEQ
) {
519 if (!(flags
&GRE_SEQ
) ||
520 (tunnel
->i_seqno
&& (s32
)(seqno
- tunnel
->i_seqno
) < 0)) {
521 stats
->rx_fifo_errors
++;
525 tunnel
->i_seqno
= seqno
+ 1;
528 stats
->rx_bytes
+= skb
->len
;
529 skb
->dev
= tunnel
->dev
;
530 dst_release(skb
->dst
);
533 ipgre_ecn_decapsulate(iph
, skb
);
535 read_unlock(&ipgre_lock
);
538 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_PORT_UNREACH
, 0);
541 read_unlock(&ipgre_lock
);
547 static int ipgre_tunnel_xmit(struct sk_buff
*skb
, struct net_device
*dev
)
549 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
550 struct net_device_stats
*stats
= &tunnel
->dev
->stats
;
551 struct iphdr
*old_iph
= ip_hdr(skb
);
555 struct rtable
*rt
; /* Route to the other host */
556 struct net_device
*tdev
; /* Device to other host */
557 struct iphdr
*iph
; /* Our new IP header */
558 unsigned int max_headroom
; /* The extra header space needed */
563 if (tunnel
->recursion
++) {
568 if (dev
->header_ops
) {
570 tiph
= (struct iphdr
*)skb
->data
;
572 gre_hlen
= tunnel
->hlen
;
573 tiph
= &tunnel
->parms
.iph
;
576 if ((dst
= tiph
->daddr
) == 0) {
579 if (skb
->dst
== NULL
) {
580 stats
->tx_fifo_errors
++;
584 if (skb
->protocol
== htons(ETH_P_IP
)) {
586 if ((dst
= rt
->rt_gateway
) == 0)
590 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
591 struct in6_addr
*addr6
;
593 struct neighbour
*neigh
= skb
->dst
->neighbour
;
598 addr6
= (struct in6_addr
*)&neigh
->primary_key
;
599 addr_type
= ipv6_addr_type(addr6
);
601 if (addr_type
== IPV6_ADDR_ANY
) {
602 addr6
= &ipv6_hdr(skb
)->daddr
;
603 addr_type
= ipv6_addr_type(addr6
);
606 if ((addr_type
& IPV6_ADDR_COMPATv4
) == 0)
609 dst
= addr6
->s6_addr32
[3];
618 if (skb
->protocol
== htons(ETH_P_IP
))
624 struct flowi fl
= { .oif
= tunnel
->parms
.link
,
627 .saddr
= tiph
->saddr
,
628 .tos
= RT_TOS(tos
) } },
629 .proto
= IPPROTO_GRE
};
630 if (ip_route_output_key(dev_net(dev
), &rt
, &fl
)) {
631 stats
->tx_carrier_errors
++;
635 tdev
= rt
->u
.dst
.dev
;
645 mtu
= dst_mtu(&rt
->u
.dst
) - dev
->hard_header_len
- tunnel
->hlen
;
647 mtu
= skb
->dst
? dst_mtu(skb
->dst
) : dev
->mtu
;
650 skb
->dst
->ops
->update_pmtu(skb
->dst
, mtu
);
652 if (skb
->protocol
== htons(ETH_P_IP
)) {
653 df
|= (old_iph
->frag_off
&htons(IP_DF
));
655 if ((old_iph
->frag_off
&htons(IP_DF
)) &&
656 mtu
< ntohs(old_iph
->tot_len
)) {
657 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
, htonl(mtu
));
663 else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
664 struct rt6_info
*rt6
= (struct rt6_info
*)skb
->dst
;
666 if (rt6
&& mtu
< dst_mtu(skb
->dst
) && mtu
>= IPV6_MIN_MTU
) {
667 if ((tunnel
->parms
.iph
.daddr
&&
668 !ipv4_is_multicast(tunnel
->parms
.iph
.daddr
)) ||
669 rt6
->rt6i_dst
.plen
== 128) {
670 rt6
->rt6i_flags
|= RTF_MODIFIED
;
671 skb
->dst
->metrics
[RTAX_MTU
-1] = mtu
;
675 if (mtu
>= IPV6_MIN_MTU
&& mtu
< skb
->len
- tunnel
->hlen
+ gre_hlen
) {
676 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
, dev
);
683 if (tunnel
->err_count
> 0) {
684 if (jiffies
- tunnel
->err_time
< IPTUNNEL_ERR_TIMEO
) {
687 dst_link_failure(skb
);
689 tunnel
->err_count
= 0;
692 max_headroom
= LL_RESERVED_SPACE(tdev
) + gre_hlen
;
694 if (skb_headroom(skb
) < max_headroom
|| skb_shared(skb
)||
695 (skb_cloned(skb
) && !skb_clone_writable(skb
, 0))) {
696 struct sk_buff
*new_skb
= skb_realloc_headroom(skb
, max_headroom
);
705 skb_set_owner_w(new_skb
, skb
->sk
);
708 old_iph
= ip_hdr(skb
);
711 skb
->transport_header
= skb
->network_header
;
712 skb_push(skb
, gre_hlen
);
713 skb_reset_network_header(skb
);
714 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
715 IPCB(skb
)->flags
&= ~(IPSKB_XFRM_TUNNEL_SIZE
| IPSKB_XFRM_TRANSFORMED
|
717 dst_release(skb
->dst
);
718 skb
->dst
= &rt
->u
.dst
;
721 * Push down and install the IPIP header.
726 iph
->ihl
= sizeof(struct iphdr
) >> 2;
728 iph
->protocol
= IPPROTO_GRE
;
729 iph
->tos
= ipgre_ecn_encapsulate(tos
, old_iph
, skb
);
730 iph
->daddr
= rt
->rt_dst
;
731 iph
->saddr
= rt
->rt_src
;
733 if ((iph
->ttl
= tiph
->ttl
) == 0) {
734 if (skb
->protocol
== htons(ETH_P_IP
))
735 iph
->ttl
= old_iph
->ttl
;
737 else if (skb
->protocol
== htons(ETH_P_IPV6
))
738 iph
->ttl
= ((struct ipv6hdr
*)old_iph
)->hop_limit
;
741 iph
->ttl
= dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
);
744 ((__be16
*)(iph
+1))[0] = tunnel
->parms
.o_flags
;
745 ((__be16
*)(iph
+1))[1] = skb
->protocol
;
747 if (tunnel
->parms
.o_flags
&(GRE_KEY
|GRE_CSUM
|GRE_SEQ
)) {
748 __be32
*ptr
= (__be32
*)(((u8
*)iph
) + tunnel
->hlen
- 4);
750 if (tunnel
->parms
.o_flags
&GRE_SEQ
) {
752 *ptr
= htonl(tunnel
->o_seqno
);
755 if (tunnel
->parms
.o_flags
&GRE_KEY
) {
756 *ptr
= tunnel
->parms
.o_key
;
759 if (tunnel
->parms
.o_flags
&GRE_CSUM
) {
761 *(__sum16
*)ptr
= ip_compute_csum((void*)(iph
+1), skb
->len
- sizeof(struct iphdr
));
772 dst_link_failure(skb
);
781 static int ipgre_tunnel_bind_dev(struct net_device
*dev
)
783 struct net_device
*tdev
= NULL
;
784 struct ip_tunnel
*tunnel
;
786 int hlen
= LL_MAX_HEADER
;
787 int mtu
= ETH_DATA_LEN
;
788 int addend
= sizeof(struct iphdr
) + 4;
790 tunnel
= netdev_priv(dev
);
791 iph
= &tunnel
->parms
.iph
;
793 /* Guess output device to choose reasonable mtu and needed_headroom */
796 struct flowi fl
= { .oif
= tunnel
->parms
.link
,
798 { .daddr
= iph
->daddr
,
800 .tos
= RT_TOS(iph
->tos
) } },
801 .proto
= IPPROTO_GRE
};
803 if (!ip_route_output_key(dev_net(dev
), &rt
, &fl
)) {
804 tdev
= rt
->u
.dst
.dev
;
807 dev
->flags
|= IFF_POINTOPOINT
;
810 if (!tdev
&& tunnel
->parms
.link
)
811 tdev
= __dev_get_by_index(dev_net(dev
), tunnel
->parms
.link
);
814 hlen
= tdev
->hard_header_len
+ tdev
->needed_headroom
;
817 dev
->iflink
= tunnel
->parms
.link
;
819 /* Precalculate GRE options length */
820 if (tunnel
->parms
.o_flags
&(GRE_CSUM
|GRE_KEY
|GRE_SEQ
)) {
821 if (tunnel
->parms
.o_flags
&GRE_CSUM
)
823 if (tunnel
->parms
.o_flags
&GRE_KEY
)
825 if (tunnel
->parms
.o_flags
&GRE_SEQ
)
828 dev
->needed_headroom
= addend
+ hlen
;
829 mtu
-= dev
->hard_header_len
- addend
;
834 tunnel
->hlen
= addend
;
840 ipgre_tunnel_ioctl (struct net_device
*dev
, struct ifreq
*ifr
, int cmd
)
843 struct ip_tunnel_parm p
;
845 struct net
*net
= dev_net(dev
);
846 struct ipgre_net
*ign
= net_generic(net
, ipgre_net_id
);
851 if (dev
== ign
->fb_tunnel_dev
) {
852 if (copy_from_user(&p
, ifr
->ifr_ifru
.ifru_data
, sizeof(p
))) {
856 t
= ipgre_tunnel_locate(net
, &p
, 0);
859 t
= netdev_priv(dev
);
860 memcpy(&p
, &t
->parms
, sizeof(p
));
861 if (copy_to_user(ifr
->ifr_ifru
.ifru_data
, &p
, sizeof(p
)))
868 if (!capable(CAP_NET_ADMIN
))
872 if (copy_from_user(&p
, ifr
->ifr_ifru
.ifru_data
, sizeof(p
)))
876 if (p
.iph
.version
!= 4 || p
.iph
.protocol
!= IPPROTO_GRE
||
877 p
.iph
.ihl
!= 5 || (p
.iph
.frag_off
&htons(~IP_DF
)) ||
878 ((p
.i_flags
|p
.o_flags
)&(GRE_VERSION
|GRE_ROUTING
)))
881 p
.iph
.frag_off
|= htons(IP_DF
);
883 if (!(p
.i_flags
&GRE_KEY
))
885 if (!(p
.o_flags
&GRE_KEY
))
888 t
= ipgre_tunnel_locate(net
, &p
, cmd
== SIOCADDTUNNEL
);
890 if (dev
!= ign
->fb_tunnel_dev
&& cmd
== SIOCCHGTUNNEL
) {
899 t
= netdev_priv(dev
);
901 if (ipv4_is_multicast(p
.iph
.daddr
))
902 nflags
= IFF_BROADCAST
;
903 else if (p
.iph
.daddr
)
904 nflags
= IFF_POINTOPOINT
;
906 if ((dev
->flags
^nflags
)&(IFF_POINTOPOINT
|IFF_BROADCAST
)) {
910 ipgre_tunnel_unlink(ign
, t
);
911 t
->parms
.iph
.saddr
= p
.iph
.saddr
;
912 t
->parms
.iph
.daddr
= p
.iph
.daddr
;
913 t
->parms
.i_key
= p
.i_key
;
914 t
->parms
.o_key
= p
.o_key
;
915 memcpy(dev
->dev_addr
, &p
.iph
.saddr
, 4);
916 memcpy(dev
->broadcast
, &p
.iph
.daddr
, 4);
917 ipgre_tunnel_link(ign
, t
);
918 netdev_state_change(dev
);
924 if (cmd
== SIOCCHGTUNNEL
) {
925 t
->parms
.iph
.ttl
= p
.iph
.ttl
;
926 t
->parms
.iph
.tos
= p
.iph
.tos
;
927 t
->parms
.iph
.frag_off
= p
.iph
.frag_off
;
928 if (t
->parms
.link
!= p
.link
) {
929 t
->parms
.link
= p
.link
;
930 dev
->mtu
= ipgre_tunnel_bind_dev(dev
);
931 netdev_state_change(dev
);
934 if (copy_to_user(ifr
->ifr_ifru
.ifru_data
, &t
->parms
, sizeof(p
)))
937 err
= (cmd
== SIOCADDTUNNEL
? -ENOBUFS
: -ENOENT
);
942 if (!capable(CAP_NET_ADMIN
))
945 if (dev
== ign
->fb_tunnel_dev
) {
947 if (copy_from_user(&p
, ifr
->ifr_ifru
.ifru_data
, sizeof(p
)))
950 if ((t
= ipgre_tunnel_locate(net
, &p
, 0)) == NULL
)
953 if (t
== netdev_priv(ign
->fb_tunnel_dev
))
957 unregister_netdevice(dev
);
969 static int ipgre_tunnel_change_mtu(struct net_device
*dev
, int new_mtu
)
971 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
973 new_mtu
> 0xFFF8 - dev
->hard_header_len
- tunnel
->hlen
)
979 /* Nice toy. Unfortunately, useless in real life :-)
980 It allows to construct virtual multiprotocol broadcast "LAN"
981 over the Internet, provided multicast routing is tuned.
984 I have no idea was this bicycle invented before me,
985 so that I had to set ARPHRD_IPGRE to a random value.
986 I have an impression, that Cisco could make something similar,
987 but this feature is apparently missing in IOS<=11.2(8).
989 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
990 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
992 ping -t 255 224.66.66.66
994 If nobody answers, mbone does not work.
996 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
997 ip addr add 10.66.66.<somewhat>/24 dev Universe
999 ifconfig Universe add fe80::<Your_real_addr>/10
1000 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1003 ftp fec0:6666:6666::193.233.7.65
1008 static int ipgre_header(struct sk_buff
*skb
, struct net_device
*dev
,
1009 unsigned short type
,
1010 const void *daddr
, const void *saddr
, unsigned len
)
1012 struct ip_tunnel
*t
= netdev_priv(dev
);
1013 struct iphdr
*iph
= (struct iphdr
*)skb_push(skb
, t
->hlen
);
1014 __be16
*p
= (__be16
*)(iph
+1);
1016 memcpy(iph
, &t
->parms
.iph
, sizeof(struct iphdr
));
1017 p
[0] = t
->parms
.o_flags
;
1021 * Set the source hardware address.
1025 memcpy(&iph
->saddr
, saddr
, 4);
1028 memcpy(&iph
->daddr
, daddr
, 4);
1031 if (iph
->daddr
&& !ipv4_is_multicast(iph
->daddr
))
1037 static int ipgre_header_parse(const struct sk_buff
*skb
, unsigned char *haddr
)
1039 struct iphdr
*iph
= (struct iphdr
*) skb_mac_header(skb
);
1040 memcpy(haddr
, &iph
->saddr
, 4);
1044 static const struct header_ops ipgre_header_ops
= {
1045 .create
= ipgre_header
,
1046 .parse
= ipgre_header_parse
,
1049 #ifdef CONFIG_NET_IPGRE_BROADCAST
1050 static int ipgre_open(struct net_device
*dev
)
1052 struct ip_tunnel
*t
= netdev_priv(dev
);
1054 if (ipv4_is_multicast(t
->parms
.iph
.daddr
)) {
1055 struct flowi fl
= { .oif
= t
->parms
.link
,
1057 { .daddr
= t
->parms
.iph
.daddr
,
1058 .saddr
= t
->parms
.iph
.saddr
,
1059 .tos
= RT_TOS(t
->parms
.iph
.tos
) } },
1060 .proto
= IPPROTO_GRE
};
1062 if (ip_route_output_key(dev_net(dev
), &rt
, &fl
))
1063 return -EADDRNOTAVAIL
;
1064 dev
= rt
->u
.dst
.dev
;
1066 if (__in_dev_get_rtnl(dev
) == NULL
)
1067 return -EADDRNOTAVAIL
;
1068 t
->mlink
= dev
->ifindex
;
1069 ip_mc_inc_group(__in_dev_get_rtnl(dev
), t
->parms
.iph
.daddr
);
1074 static int ipgre_close(struct net_device
*dev
)
1076 struct ip_tunnel
*t
= netdev_priv(dev
);
1077 if (ipv4_is_multicast(t
->parms
.iph
.daddr
) && t
->mlink
) {
1078 struct in_device
*in_dev
;
1079 in_dev
= inetdev_by_index(dev_net(dev
), t
->mlink
);
1081 ip_mc_dec_group(in_dev
, t
->parms
.iph
.daddr
);
1090 static void ipgre_tunnel_setup(struct net_device
*dev
)
1092 dev
->init
= ipgre_tunnel_init
;
1093 dev
->uninit
= ipgre_tunnel_uninit
;
1094 dev
->destructor
= free_netdev
;
1095 dev
->hard_start_xmit
= ipgre_tunnel_xmit
;
1096 dev
->do_ioctl
= ipgre_tunnel_ioctl
;
1097 dev
->change_mtu
= ipgre_tunnel_change_mtu
;
1099 dev
->type
= ARPHRD_IPGRE
;
1100 dev
->needed_headroom
= LL_MAX_HEADER
+ sizeof(struct iphdr
) + 4;
1101 dev
->mtu
= ETH_DATA_LEN
- sizeof(struct iphdr
) - 4;
1102 dev
->flags
= IFF_NOARP
;
1105 dev
->features
|= NETIF_F_NETNS_LOCAL
;
1108 static int ipgre_tunnel_init(struct net_device
*dev
)
1110 struct ip_tunnel
*tunnel
;
1113 tunnel
= netdev_priv(dev
);
1114 iph
= &tunnel
->parms
.iph
;
1117 strcpy(tunnel
->parms
.name
, dev
->name
);
1119 memcpy(dev
->dev_addr
, &tunnel
->parms
.iph
.saddr
, 4);
1120 memcpy(dev
->broadcast
, &tunnel
->parms
.iph
.daddr
, 4);
1123 #ifdef CONFIG_NET_IPGRE_BROADCAST
1124 if (ipv4_is_multicast(iph
->daddr
)) {
1127 dev
->flags
= IFF_BROADCAST
;
1128 dev
->header_ops
= &ipgre_header_ops
;
1129 dev
->open
= ipgre_open
;
1130 dev
->stop
= ipgre_close
;
1134 dev
->header_ops
= &ipgre_header_ops
;
1139 static int ipgre_fb_tunnel_init(struct net_device
*dev
)
1141 struct ip_tunnel
*tunnel
= netdev_priv(dev
);
1142 struct iphdr
*iph
= &tunnel
->parms
.iph
;
1143 struct ipgre_net
*ign
= net_generic(dev_net(dev
), ipgre_net_id
);
1146 strcpy(tunnel
->parms
.name
, dev
->name
);
1149 iph
->protocol
= IPPROTO_GRE
;
1151 tunnel
->hlen
= sizeof(struct iphdr
) + 4;
1154 ign
->tunnels_wc
[0] = tunnel
;
1159 static struct net_protocol ipgre_protocol
= {
1160 .handler
= ipgre_rcv
,
1161 .err_handler
= ipgre_err
,
1165 static void ipgre_destroy_tunnels(struct ipgre_net
*ign
)
1169 for (prio
= 0; prio
< 4; prio
++) {
1171 for (h
= 0; h
< HASH_SIZE
; h
++) {
1172 struct ip_tunnel
*t
;
1173 while ((t
= ign
->tunnels
[prio
][h
]) != NULL
)
1174 unregister_netdevice(t
->dev
);
1179 static int ipgre_init_net(struct net
*net
)
1182 struct ipgre_net
*ign
;
1185 ign
= kzalloc(sizeof(struct ipgre_net
), GFP_KERNEL
);
1189 err
= net_assign_generic(net
, ipgre_net_id
, ign
);
1193 ign
->fb_tunnel_dev
= alloc_netdev(sizeof(struct ip_tunnel
), "gre0",
1194 ipgre_tunnel_setup
);
1195 if (!ign
->fb_tunnel_dev
) {
1200 ign
->fb_tunnel_dev
->init
= ipgre_fb_tunnel_init
;
1201 dev_net_set(ign
->fb_tunnel_dev
, net
);
1202 ign
->fb_tunnel_dev
->rtnl_link_ops
= &ipgre_link_ops
;
1204 if ((err
= register_netdev(ign
->fb_tunnel_dev
)))
1210 free_netdev(ign
->fb_tunnel_dev
);
1219 static void ipgre_exit_net(struct net
*net
)
1221 struct ipgre_net
*ign
;
1223 ign
= net_generic(net
, ipgre_net_id
);
1225 ipgre_destroy_tunnels(ign
);
1230 static struct pernet_operations ipgre_net_ops
= {
1231 .init
= ipgre_init_net
,
1232 .exit
= ipgre_exit_net
,
1235 static int ipgre_tunnel_validate(struct nlattr
*tb
[], struct nlattr
*data
[])
1243 if (data
[IFLA_GRE_IFLAGS
])
1244 flags
|= nla_get_be16(data
[IFLA_GRE_IFLAGS
]);
1245 if (data
[IFLA_GRE_OFLAGS
])
1246 flags
|= nla_get_be16(data
[IFLA_GRE_OFLAGS
]);
1247 if (flags
& (GRE_VERSION
|GRE_ROUTING
))
1253 static void ipgre_netlink_parms(struct nlattr
*data
[],
1254 struct ip_tunnel_parm
*parms
)
1256 memset(parms
, 0, sizeof(parms
));
1258 parms
->iph
.protocol
= IPPROTO_GRE
;
1263 if (data
[IFLA_GRE_LINK
])
1264 parms
->link
= nla_get_u32(data
[IFLA_GRE_LINK
]);
1266 if (data
[IFLA_GRE_IFLAGS
])
1267 parms
->i_flags
= nla_get_be16(data
[IFLA_GRE_IFLAGS
]);
1269 if (data
[IFLA_GRE_OFLAGS
])
1270 parms
->o_flags
= nla_get_be16(data
[IFLA_GRE_OFLAGS
]);
1272 if (data
[IFLA_GRE_IKEY
])
1273 parms
->i_key
= nla_get_be32(data
[IFLA_GRE_IKEY
]);
1275 if (data
[IFLA_GRE_OKEY
])
1276 parms
->o_key
= nla_get_be32(data
[IFLA_GRE_OKEY
]);
1278 if (data
[IFLA_GRE_LOCAL
])
1279 memcpy(&parms
->iph
.saddr
, nla_data(data
[IFLA_GRE_LOCAL
]), 4);
1281 if (data
[IFLA_GRE_REMOTE
])
1282 memcpy(&parms
->iph
.daddr
, nla_data(data
[IFLA_GRE_REMOTE
]), 4);
1284 if (data
[IFLA_GRE_TTL
])
1285 parms
->iph
.ttl
= nla_get_u8(data
[IFLA_GRE_TTL
]);
1287 if (data
[IFLA_GRE_TOS
])
1288 parms
->iph
.tos
= nla_get_u8(data
[IFLA_GRE_TOS
]);
1290 if (!data
[IFLA_GRE_PMTUDISC
] || nla_get_u8(data
[IFLA_GRE_PMTUDISC
]))
1291 parms
->iph
.frag_off
= htons(IP_DF
);
1294 static int ipgre_newlink(struct net_device
*dev
, struct nlattr
*tb
[],
1295 struct nlattr
*data
[])
1297 struct ip_tunnel
*nt
;
1298 struct net
*net
= dev_net(dev
);
1299 struct ipgre_net
*ign
= net_generic(net
, ipgre_net_id
);
1303 nt
= netdev_priv(dev
);
1304 ipgre_netlink_parms(data
, &nt
->parms
);
1306 if (ipgre_tunnel_locate(net
, &nt
->parms
, 0))
1309 mtu
= ipgre_tunnel_bind_dev(dev
);
1313 err
= register_netdevice(dev
);
1318 ipgre_tunnel_link(ign
, nt
);
1324 static int ipgre_changelink(struct net_device
*dev
, struct nlattr
*tb
[],
1325 struct nlattr
*data
[])
1327 struct ip_tunnel
*t
, *nt
;
1328 struct net
*net
= dev_net(dev
);
1329 struct ipgre_net
*ign
= net_generic(net
, ipgre_net_id
);
1330 struct ip_tunnel_parm p
;
1333 if (dev
== ign
->fb_tunnel_dev
)
1336 nt
= netdev_priv(dev
);
1337 ipgre_netlink_parms(data
, &p
);
1339 t
= ipgre_tunnel_locate(net
, &p
, 0);
1345 unsigned nflags
= 0;
1349 if (ipv4_is_multicast(p
.iph
.daddr
))
1350 nflags
= IFF_BROADCAST
;
1351 else if (p
.iph
.daddr
)
1352 nflags
= IFF_POINTOPOINT
;
1354 if ((dev
->flags
^ nflags
) &
1355 (IFF_POINTOPOINT
| IFF_BROADCAST
))
1358 ipgre_tunnel_unlink(ign
, t
);
1359 t
->parms
.iph
.saddr
= p
.iph
.saddr
;
1360 t
->parms
.iph
.daddr
= p
.iph
.daddr
;
1361 t
->parms
.i_key
= p
.i_key
;
1362 memcpy(dev
->dev_addr
, &p
.iph
.saddr
, 4);
1363 memcpy(dev
->broadcast
, &p
.iph
.daddr
, 4);
1364 ipgre_tunnel_link(ign
, t
);
1365 netdev_state_change(dev
);
1368 t
->parms
.o_key
= p
.o_key
;
1369 t
->parms
.iph
.ttl
= p
.iph
.ttl
;
1370 t
->parms
.iph
.tos
= p
.iph
.tos
;
1371 t
->parms
.iph
.frag_off
= p
.iph
.frag_off
;
1373 if (t
->parms
.link
!= p
.link
) {
1374 t
->parms
.link
= p
.link
;
1375 mtu
= ipgre_tunnel_bind_dev(dev
);
1378 netdev_state_change(dev
);
1384 static size_t ipgre_get_size(const struct net_device
*dev
)
1389 /* IFLA_GRE_IFLAGS */
1391 /* IFLA_GRE_OFLAGS */
1397 /* IFLA_GRE_LOCAL */
1399 /* IFLA_GRE_REMOTE */
1405 /* IFLA_GRE_PMTUDISC */
1410 static int ipgre_fill_info(struct sk_buff
*skb
, const struct net_device
*dev
)
1412 struct ip_tunnel
*t
= netdev_priv(dev
);
1413 struct ip_tunnel_parm
*p
= &t
->parms
;
1415 NLA_PUT_U32(skb
, IFLA_GRE_LINK
, p
->link
);
1416 NLA_PUT_BE16(skb
, IFLA_GRE_IFLAGS
, p
->i_flags
);
1417 NLA_PUT_BE16(skb
, IFLA_GRE_OFLAGS
, p
->o_flags
);
1418 NLA_PUT_BE32(skb
, IFLA_GRE_IFLAGS
, p
->i_flags
);
1419 NLA_PUT_BE32(skb
, IFLA_GRE_OFLAGS
, p
->o_flags
);
1420 NLA_PUT(skb
, IFLA_GRE_LOCAL
, 4, &p
->iph
.saddr
);
1421 NLA_PUT(skb
, IFLA_GRE_REMOTE
, 4, &p
->iph
.daddr
);
1422 NLA_PUT_U8(skb
, IFLA_GRE_TTL
, p
->iph
.ttl
);
1423 NLA_PUT_U8(skb
, IFLA_GRE_TOS
, p
->iph
.tos
);
1424 NLA_PUT_U8(skb
, IFLA_GRE_PMTUDISC
, !!(p
->iph
.frag_off
& htons(IP_DF
)));
1432 static const struct nla_policy ipgre_policy
[IFLA_GRE_MAX
+ 1] = {
1433 [IFLA_GRE_LINK
] = { .type
= NLA_U32
},
1434 [IFLA_GRE_IFLAGS
] = { .type
= NLA_U16
},
1435 [IFLA_GRE_OFLAGS
] = { .type
= NLA_U16
},
1436 [IFLA_GRE_IKEY
] = { .type
= NLA_U32
},
1437 [IFLA_GRE_OKEY
] = { .type
= NLA_U32
},
1438 [IFLA_GRE_LOCAL
] = { .len
= 4 },
1439 [IFLA_GRE_REMOTE
] = { .len
= 4 },
1440 [IFLA_GRE_TTL
] = { .type
= NLA_U8
},
1441 [IFLA_GRE_TOS
] = { .type
= NLA_U8
},
1442 [IFLA_GRE_PMTUDISC
] = { .type
= NLA_U8
},
1445 static struct rtnl_link_ops ipgre_link_ops __read_mostly
= {
1447 .maxtype
= IFLA_GRE_MAX
,
1448 .policy
= ipgre_policy
,
1449 .priv_size
= sizeof(struct ip_tunnel
),
1450 .setup
= ipgre_tunnel_setup
,
1451 .validate
= ipgre_tunnel_validate
,
1452 .newlink
= ipgre_newlink
,
1453 .changelink
= ipgre_changelink
,
1454 .get_size
= ipgre_get_size
,
1455 .fill_info
= ipgre_fill_info
,
1459 * And now the modules code and kernel interface.
1462 static int __init
ipgre_init(void)
1466 printk(KERN_INFO
"GRE over IPv4 tunneling driver\n");
1468 if (inet_add_protocol(&ipgre_protocol
, IPPROTO_GRE
) < 0) {
1469 printk(KERN_INFO
"ipgre init: can't add protocol\n");
1473 err
= register_pernet_gen_device(&ipgre_net_id
, &ipgre_net_ops
);
1475 goto gen_device_failed
;
1477 err
= rtnl_link_register(&ipgre_link_ops
);
1479 goto rtnl_link_failed
;
1485 unregister_pernet_gen_device(ipgre_net_id
, &ipgre_net_ops
);
1487 inet_del_protocol(&ipgre_protocol
, IPPROTO_GRE
);
1491 static void __exit
ipgre_fini(void)
1493 rtnl_link_unregister(&ipgre_link_ops
);
1494 unregister_pernet_gen_device(ipgre_net_id
, &ipgre_net_ops
);
1495 if (inet_del_protocol(&ipgre_protocol
, IPPROTO_GRE
) < 0)
1496 printk(KERN_INFO
"ipgre close: can't remove protocol\n");
1499 module_init(ipgre_init
);
1500 module_exit(ipgre_fini
);
1501 MODULE_LICENSE("GPL");
1502 MODULE_ALIAS("rtnl-link-gre");