/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation,
   it is an infeasible task. The most general solution would be
   to keep skb->encapsulation counter (sort of local ttl),
   and silently drop packet when it expires. It is the best
   solution, but it supposes maintaining new variable in ALL
   skb, even if no tunneling is used.

   Current solution: t->recursion lock breaks dead loops. It looks
   like dev->tbusy flag, but I preferred new variable, because
   the semantics is different. One day, when hard_start_xmit
   will be multithreaded we will have to use skb->encapsulation.


   2. Networking dead loops would not kill routers, but would really
   kill network. IP hop limit plays role of "t->recursion" in this case,
   if we copy it from packet being encapsulated to upper header.
   It is very good solution, but it introduces two problems:

   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)), all routers (at least, in neighbourhood of mine)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse packet trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force DF flag on tunnels with preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and tunnel mtu
   quickly degrades to value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop,
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated who injected
   fatal route to network, even if it were you who configured
   fatal static route: you are innocent. :-)
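
   (Illustrative arithmetic, not in the original text: each nested
   IP+GRE encapsulation adds at least 24 bytes of header, so once DF
   is forced the usable tunnel mtu shrinks by >=24 bytes per trip
   around the loop; starting from 1500 it takes at most ~60 iterations
   to fall below 68, at which point transmission stops.)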

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is integral part of IPv6, ipip and gre are naturally modular.
   We could extract common parts (hash table, ioctl etc)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

#define HASH_SIZE  16

static int ipgre_net_id;
struct ipgre_net {
        struct ip_tunnel *tunnels[4][HASH_SIZE];

        struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matched to a configured keyless tunnel,
   will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
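
/* Worked example (note added for clarity, not in the original source):
 * for addr 0x12345678, addr>>4 is 0x01234567, the xor is 0x1317131f,
 * and masking with 0xF selects bucket 0xf. Folding the shifted address
 * in keeps more address bits in play for the 16-entry tables than the
 * low nibble alone would.
 */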

#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]

static DEFINE_RWLOCK(ipgre_lock);

/* Given src, dst and key, find appropriate for input tunnel. */

static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
                __be32 remote, __be32 local, __be32 key)
{
        unsigned h0 = HASH(remote);
        unsigned h1 = HASH(key);
        struct ip_tunnel *t;
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
                if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
                        if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
                                return t;
                }
        }
        for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
                if (remote == t->parms.iph.daddr) {
                        if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
                                return t;
                }
        }
        for (t = ign->tunnels_l[h1]; t; t = t->next) {
                if (local == t->parms.iph.saddr ||
                    (local == t->parms.iph.daddr &&
                     ipv4_is_multicast(local))) {
                        if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
                                return t;
                }
        }
        for (t = ign->tunnels_wc[h1]; t; t = t->next) {
                if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
                        return t;
        }

        if (ign->fb_tunnel_dev->flags&IFF_UP)
                return netdev_priv(ign->fb_tunnel_dev);
        return NULL;
}
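
/* Note added for clarity (not in the original source): prio below encodes
 * which of the 4 tables a tunnel lands in; bit 0 is set when a local
 * address is configured and bit 1 when a unicast remote is, matching the
 * (remote,local) / (remote,*) / (*,local) / (*,*) split described above.
 */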
static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
                struct ip_tunnel_parm *parms)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        unsigned h = HASH(key);
        int prio = 0;

        if (local)
                prio |= 1;
        if (remote && !ipv4_is_multicast(remote)) {
                prio |= 2;
                h ^= HASH(remote);
        }

        return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
                struct ip_tunnel *t)
{
        return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp = ipgre_bucket(ign, t);

        t->next = *tp;
        write_lock_bh(&ipgre_lock);
        *tp = t;
        write_unlock_bh(&ipgre_lock);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp;

        for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
                if (t == *tp) {
                        write_lock_bh(&ipgre_lock);
                        *tp = t->next;
                        write_unlock_bh(&ipgre_lock);
                        break;
                }
        }
}

static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
                struct ip_tunnel_parm *parms, int create)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        struct ip_tunnel *t, **tp, *nt;
        struct net_device *dev;
        char name[IFNAMSIZ];
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
                if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
                        if (key == t->parms.i_key)
                                return t;
                }
        }
        if (!create)
                return NULL;

        if (parms->name[0])
                strlcpy(name, parms->name, IFNAMSIZ);
        else
                sprintf(name, "gre%%d");

        dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
        if (!dev)
                return NULL;

        dev_net_set(dev, net);

        if (strchr(name, '%')) {
                if (dev_alloc_name(dev, name) < 0)
                        goto failed_free;
        }

        dev->init = ipgre_tunnel_init;
        nt = netdev_priv(dev);
        nt->parms = *parms;

        if (register_netdevice(dev) < 0)
                goto failed_free;

        dev_hold(dev);
        ipgre_tunnel_link(ign, nt);
        return nt;

failed_free:
        free_netdev(dev);
        return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        ipgre_tunnel_unlink(ign, netdev_priv(dev));
        dev_put(dev);
}

static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes it impossible to maintain even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by a Cisco employee;
   why the hell do these idiots break the standards established
   by themselves???
 */

        struct iphdr *iph = (struct iphdr*)skb->data;
        __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
        int grehlen = (iph->ihl<<2) + 4;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct ip_tunnel *t;
        __be16 flags;

        flags = p[0];
        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        return;
                if (flags&GRE_KEY) {
                        grehlen += 4;
                        if (flags&GRE_CSUM)
                                grehlen += 4;
                }
        }

        /* If only 8 bytes returned, keyed message will be dropped here */
        if (skb_headlen(skb) < grehlen)
                return;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return;
                case ICMP_FRAG_NEEDED:
                        /* Soft state for pmtu is maintained by IP core. */
                        return;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH,
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;
        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
                break;
        }

        read_lock(&ipgre_lock);
        t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
                                (flags&GRE_KEY) ?
                                *(((__be32*)p) + (grehlen>>2) - 1) : 0);
        if (t == NULL || t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                goto out;

        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                goto out;

        if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
out:
        read_unlock(&ipgre_lock);
        return;
}

static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
        if (INET_ECN_is_ce(iph->tos)) {
                if (skb->protocol == htons(ETH_P_IP)) {
                        IP_ECN_set_ce(ip_hdr(skb));
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        IP6_ECN_set_ce(ipv6_hdr(skb));
                }
        }
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
{
        u8 inner = 0;
        if (skb->protocol == htons(ETH_P_IP))
                inner = old_iph->tos;
        else if (skb->protocol == htons(ETH_P_IPV6))
                inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
        return INET_ECN_encapsulate(tos, inner);
}

static int ipgre_rcv(struct sk_buff *skb)
{
        struct iphdr *iph;
        u8     *h;
        __be16    flags;
        __sum16   csum = 0;
        __be32 key = 0;
        u32    seqno = 0;
        struct ip_tunnel *tunnel;
        int    offset = 4;
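
        /* Note added for clarity (not in the original source): 16 bytes is
         * the worst-case header this function parses: 4 bytes of GRE
         * flags/protocol plus 4 each for the optional checksum, key and
         * sequence-number fields.
         */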
        if (!pskb_may_pull(skb, 16))
                goto drop_nolock;

        iph = ip_hdr(skb);
        h = skb->data;
        flags = *(__be16*)h;

        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
                /* - Version must be 0.
                   - We do not support routing headers.
                 */
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        goto drop_nolock;

                if (flags&GRE_CSUM) {
                        switch (skb->ip_summed) {
                        case CHECKSUM_COMPLETE:
                                csum = csum_fold(skb->csum);
                                if (!csum)
                                        break;
                                /* fall through */
                        case CHECKSUM_NONE:
                                skb->csum = 0;
                                csum = __skb_checksum_complete(skb);
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        }
                        offset += 4;
                }
                if (flags&GRE_KEY) {
                        key = *(__be32*)(h + offset);
                        offset += 4;
                }
                if (flags&GRE_SEQ) {
                        seqno = ntohl(*(__be32*)(h + offset));
                        offset += 4;
                }
        }

        read_lock(&ipgre_lock);
        if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
                                        iph->saddr, iph->daddr, key)) != NULL) {
                secpath_reset(skb);

                skb->protocol = *(__be16*)(h + 2);
                /* WCCP version 1 and 2 protocol decoding.
                 * - Change protocol to IP
                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
                 */
                if (flags == 0 &&
                    skb->protocol == htons(ETH_P_WCCP)) {
                        skb->protocol = htons(ETH_P_IP);
                        if ((*(h + offset) & 0xF0) != 0x40)
                                offset += 4;
                }

                skb->mac_header = skb->network_header;
                __pskb_pull(skb, offset);
                skb_reset_network_header(skb);
                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
                skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Looped back packet, drop it! */
                        if (skb->rtable->fl.iif == 0)
                                goto drop;
                        tunnel->stat.multicast++;
                        skb->pkt_type = PACKET_BROADCAST;
                }
#endif

                if (((flags&GRE_CSUM) && csum) ||
                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
                        tunnel->stat.rx_crc_errors++;
                        tunnel->stat.rx_errors++;
                        goto drop;
                }
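                /* Note added for clarity (not in the original source): the
                 * (s32) cast below gives serial-number style comparison, so
                 * sequence numbers that wrap around 2^32 still count as
                 * newer.
                 */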
                if (tunnel->parms.i_flags&GRE_SEQ) {
                        if (!(flags&GRE_SEQ) ||
                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
                                tunnel->stat.rx_fifo_errors++;
                                tunnel->stat.rx_errors++;
                                goto drop;
                        }
                        tunnel->i_seqno = seqno + 1;
                }
                tunnel->stat.rx_packets++;
                tunnel->stat.rx_bytes += skb->len;
                skb->dev = tunnel->dev;
                dst_release(skb->dst);
                skb->dst = NULL;
                nf_reset(skb);
                ipgre_ecn_decapsulate(iph, skb);
                netif_rx(skb);
                read_unlock(&ipgre_lock);
                return(0);
        }
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
        read_unlock(&ipgre_lock);
drop_nolock:
        kfree_skb(skb);
        return(0);
}

static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net_device_stats *stats = &tunnel->stat;
        struct iphdr  *old_iph = ip_hdr(skb);
        struct iphdr  *tiph;
        u8     tos;
        __be16 df;
        struct rtable *rt;                      /* Route to the other host */
        struct net_device *tdev;                /* Device to other host */
        struct iphdr  *iph;                     /* Our new IP header */
        unsigned int max_headroom;              /* The extra header space needed */
        int    gre_hlen;
        __be32 dst;
        int    mtu;

        if (tunnel->recursion++) {
                tunnel->stat.collisions++;
                goto tx_error;
        }

        if (dev->header_ops) {
                gre_hlen = 0;
                tiph = (struct iphdr*)skb->data;
        } else {
                gre_hlen = tunnel->hlen;
                tiph = &tunnel->parms.iph;
        }

        if ((dst = tiph->daddr) == 0) {
                /* NBMA tunnel */

                if (skb->dst == NULL) {
                        tunnel->stat.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb->rtable;
                        if ((dst = rt->rt_gateway) == 0)
                                goto tx_error_icmp;
                }
#ifdef CONFIG_IPV6
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        struct in6_addr *addr6;
                        int addr_type;
                        struct neighbour *neigh = skb->dst->neighbour;

                        if (neigh == NULL)
                                goto tx_error;

                        addr6 = (struct in6_addr*)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                goto tx_error_icmp;

                        dst = addr6->s6_addr32[3];
                }
#endif
                else
                        goto tx_error;
        }

        tos = tiph->tos;
        if (tos&1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = old_iph->tos;
                tos &= ~1;
        }

        {
                struct flowi fl = { .oif = tunnel->parms.link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = dst,
                                                .saddr = tiph->saddr,
                                                .tos = RT_TOS(tos) } },
                                    .proto = IPPROTO_GRE };
                if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
                        tunnel->stat.tx_carrier_errors++;
                        goto tx_error;
                }
        }
        tdev = rt->u.dst.dev;

        if (tdev == dev) {
                ip_rt_put(rt);
                tunnel->stat.collisions++;
                goto tx_error;
        }

        df = tiph->frag_off;
        if (df)
                mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
        else
                mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

        if (skb->dst)
                skb->dst->ops->update_pmtu(skb->dst, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                df |= (old_iph->frag_off&htons(IP_DF));

                if ((old_iph->frag_off&htons(IP_DF)) &&
                    mtu < ntohs(old_iph->tot_len)) {
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        ip_rt_put(rt);
                        goto tx_error;
                }
        }
#ifdef CONFIG_IPV6
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

                if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                skb->dst->metrics[RTAX_MTU-1] = mtu;
                        }
                }

                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
                        ip_rt_put(rt);
                        goto tx_error;
                }
        }
#endif

        if (tunnel->err_count > 0) {
                if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        ip_rt_put(rt);
                        stats->tx_dropped++;
                        dev_kfree_skb(skb);
                        tunnel->recursion--;
                        return 0;
                }
                if (skb->sk)
                        skb_set_owner_w(new_skb, skb->sk);
                dev_kfree_skb(skb);
                skb = new_skb;
                old_iph = ip_hdr(skb);
        }

        skb->transport_header = skb->network_header;
        skb_push(skb, gre_hlen);
        skb_reset_network_header(skb);
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
                              IPSKB_REROUTED);
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /*
         *      Push down and install the IPIP header.
         */

        iph                     =       ip_hdr(skb);
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr) >> 2;
        iph->frag_off           =       df;
        iph->protocol           =       IPPROTO_GRE;
        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
        iph->daddr              =       rt->rt_dst;
        iph->saddr              =       rt->rt_src;

        if ((iph->ttl = tiph->ttl) == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
                else if (skb->protocol == htons(ETH_P_IPV6))
                        iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
                else
                        iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
        }

        ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
        ((__be16*)(iph+1))[1] = skb->protocol;
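
        /* Note added for clarity (not in the original source): the optional
         * GRE fields follow the base header on the wire in the order
         * checksum, key, sequence number; the code below fills them in
         * reverse, starting at the last word of the tunnel->hlen header
         * and walking ptr backwards.
         */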
        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

                if (tunnel->parms.o_flags&GRE_SEQ) {
                        ++tunnel->o_seqno;
                        *ptr = htonl(tunnel->o_seqno);
                        ptr--;
                }
                if (tunnel->parms.o_flags&GRE_KEY) {
                        *ptr = tunnel->parms.o_key;
                        ptr--;
                }
                if (tunnel->parms.o_flags&GRE_CSUM) {
                        *ptr = 0;
                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
                }
        }

        nf_reset(skb);

        IPTUNNEL_XMIT();
        tunnel->recursion--;
        return 0;

tx_error_icmp:
        dst_link_failure(skb);

tx_error:
        stats->tx_errors++;
        dev_kfree_skb(skb);
        tunnel->recursion--;
        return 0;
}

static void ipgre_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel;
        struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int addend = sizeof(struct iphdr) + 4;

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and hard_header_len */

        if (iph->daddr) {
                struct flowi fl = { .oif = tunnel->parms.link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .saddr = iph->saddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_GRE };
                struct rtable *rt;
                if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
                        tdev = rt->u.dst.dev;
                        ip_rt_put(rt);
                }
                dev->flags |= IFF_POINTOPOINT;
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len;
                mtu = tdev->mtu;
        }
        dev->iflink = tunnel->parms.link;

        /* Precalculate GRE options length */
        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
                if (tunnel->parms.o_flags&GRE_CSUM)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_KEY)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_SEQ)
                        addend += 4;
        }
        dev->hard_header_len = hlen + addend;
        dev->mtu = mtu - addend;
        tunnel->hlen = addend;
}
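
/* Illustrative arithmetic, not in the original source: with checksum, key
 * and sequence numbers all enabled, addend above is 20 (IP) + 4 (GRE base)
 * + 12 (options) = 36, so a tunnel bound to a 1500-byte Ethernet link
 * advertises an mtu of 1464.
 */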

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
        int err = 0;
        struct ip_tunnel_parm p;
        struct ip_tunnel *t;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                t = NULL;
                if (dev == ign->fb_tunnel_dev) {
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
                                err = -EFAULT;
                                break;
                        }
                        t = ipgre_tunnel_locate(net, &p, 0);
                }
                if (t == NULL)
                        t = netdev_priv(dev);
                memcpy(&p, &t->parms, sizeof(p));
                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                        err = -EFAULT;
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                err = -EFAULT;
                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                        goto done;

                err = -EINVAL;
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        goto done;
                if (p.iph.ttl)
                        p.iph.frag_off |= htons(IP_DF);

                if (!(p.i_flags&GRE_KEY))
                        p.i_key = 0;
                if (!(p.o_flags&GRE_KEY))
                        p.o_key = 0;

                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned nflags = 0;

                                t = netdev_priv(dev);

                                if (ipv4_is_multicast(p.iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p.iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }
                                ipgre_tunnel_unlink(ign, t);
                                t->parms.iph.saddr = p.iph.saddr;
                                t->parms.iph.daddr = p.iph.daddr;
                                t->parms.i_key = p.i_key;
                                t->parms.o_key = p.o_key;
                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
                                memcpy(dev->broadcast, &p.iph.daddr, 4);
                                ipgre_tunnel_link(ign, t);
                                netdev_state_change(dev);
                        }
                }

                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
                                t->parms.iph.ttl = p.iph.ttl;
                                t->parms.iph.tos = p.iph.tos;
                                t->parms.iph.frag_off = p.iph.frag_off;
                                if (t->parms.link != p.link) {
                                        t->parms.link = p.link;
                                        ipgre_tunnel_bind_dev(dev);
                                        netdev_state_change(dev);
                                }
                        }
                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
                                err = -EFAULT;
                } else
                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                if (dev == ign->fb_tunnel_dev) {
                        err = -EFAULT;
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                                goto done;
                        err = -ENOENT;
                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(ign->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}

static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
{
        return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
                return -EINVAL;
        dev->mtu = new_mtu;
        return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows to construct virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so that I had to set ARPHRD_IPGRE to a random value.
   I have an impression, that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ftp fec0:6666:6666::193.233.7.65
 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
                        unsigned short type,
                        const void *daddr, const void *saddr, unsigned len)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
        __be16 *p = (__be16*)(iph+1);

        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
        p[0] = t->parms.o_flags;
        p[1] = htons(type);

        /*
         *      Set the source hardware address.
         */

        if (saddr)
                memcpy(&iph->saddr, saddr, 4);

        if (daddr) {
                memcpy(&iph->daddr, daddr, 4);
                return t->hlen;
        }
        if (iph->daddr && !ipv4_is_multicast(iph->daddr))
                return t->hlen;
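
        /* Note added for clarity (an assumption based on the analogous
         * eth_header() convention, not stated in this file): a negative
         * return signals that a header was pushed but the destination is
         * not yet resolved, so the caller may fill it in later.
         */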
        return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
        memcpy(haddr, &iph->saddr, 4);
        return 4;
}

static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi fl = { .oif = t->parms.link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = t->parms.iph.daddr,
                                                .saddr = t->parms.iph.saddr,
                                                .tos = RT_TOS(t->parms.iph.tos) } },
                                    .proto = IPPROTO_GRE };
                struct rtable *rt;
                if (ip_route_output_key(dev_net(dev), &rt, &fl))
                        return -EADDRNOTAVAIL;
                dev = rt->u.dst.dev;
                ip_rt_put(rt);
                if (__in_dev_get_rtnl(dev) == NULL)
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}

static int ipgre_close(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);
        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
                struct in_device *in_dev;
                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
                if (in_dev) {
                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
                        in_dev_put(in_dev);
                }
        }
        return 0;
}

#endif

static void ipgre_tunnel_setup(struct net_device *dev)
{
        dev->uninit             = ipgre_tunnel_uninit;
        dev->destructor         = free_netdev;
        dev->hard_start_xmit    = ipgre_tunnel_xmit;
        dev->get_stats          = ipgre_tunnel_get_stats;
        dev->do_ioctl           = ipgre_tunnel_ioctl;
        dev->change_mtu         = ipgre_tunnel_change_mtu;

        dev->type               = ARPHRD_IPGRE;
        dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
        dev->flags              = IFF_NOARP;
        dev->iflink             = 0;
        dev->addr_len           = 4;
        dev->features           |= NETIF_F_NETNS_LOCAL;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        struct iphdr *iph;

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

        ipgre_tunnel_bind_dev(dev);

        if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                        dev->open = ipgre_open;
                        dev->stop = ipgre_close;
                }
#endif
        } else
                dev->header_ops = &ipgre_header_ops;

        return 0;
}

static int ipgre_fb_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        iph->version            = 4;
        iph->protocol           = IPPROTO_GRE;
        iph->ihl                = 5;
        tunnel->hlen            = sizeof(struct iphdr) + 4;

        dev_hold(dev);
        ign->tunnels_wc[0]      = tunnel;
        return 0;
}

static struct net_protocol ipgre_protocol = {
        .handler        =       ipgre_rcv,
        .err_handler    =       ipgre_err,
        .netns_ok       =       1,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign)
{
        int prio;

        for (prio = 0; prio < 4; prio++) {
                int h;
                for (h = 0; h < HASH_SIZE; h++) {
                        struct ip_tunnel *t;
                        while ((t = ign->tunnels[prio][h]) != NULL)
                                unregister_netdevice(t->dev);
                }
        }
}

static int ipgre_init_net(struct net *net)
{
        int err;
        struct ipgre_net *ign;

        err = -ENOMEM;
        ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
        if (ign == NULL)
                goto err_alloc;

        err = net_assign_generic(net, ipgre_net_id, ign);
        if (err < 0)
                goto err_assign;

        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
                                          ipgre_tunnel_setup);
        if (!ign->fb_tunnel_dev) {
                err = -ENOMEM;
                goto err_alloc_dev;
        }

        ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
        dev_net_set(ign->fb_tunnel_dev, net);

        if ((err = register_netdev(ign->fb_tunnel_dev)))
                goto err_reg_dev;

        return 0;

err_reg_dev:
        free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
        /* nothing */
err_assign:
        kfree(ign);
err_alloc:
        return err;
}

static void ipgre_exit_net(struct net *net)
{
        struct ipgre_net *ign;

        ign = net_generic(net, ipgre_net_id);
        rtnl_lock();
        ipgre_destroy_tunnels(ign);
        rtnl_unlock();
        kfree(ign);
}

static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
};

/*
 *      And now the modules code and kernel interface.
 */

static int __init ipgre_init(void)
{
        int err;

        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

        if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
                printk(KERN_INFO "ipgre init: can't add protocol\n");
                return -EAGAIN;
        }

        err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
        if (err < 0)
                inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);

        return err;
}

static void __exit ipgre_fini(void)
{
        if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
                printk(KERN_INFO "ipgre close: can't remove protocol\n");

        unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");