netfilter: nf_conntrack_extend: avoid unnecessary "ct->ext" dereferences
[linux-2.6/mini2440.git] / net / ipv4 / ip_gre.c
blob2a61158ea7226cdff5d6b28fa0cbb0f6b2533d96
1 /*
2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
32 #include <net/sock.h>
33 #include <net/ip.h>
34 #include <net/icmp.h>
35 #include <net/protocol.h>
36 #include <net/ipip.h>
37 #include <net/arp.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
41 #include <net/xfrm.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
45 #ifdef CONFIG_IPV6
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #endif
52 Problems & solutions
53 --------------------
55 1. The most important issue is detecting local dead loops.
56 They would cause complete host lockup in transmit, which
57 would be "resolved" by stack overflow or, if queueing is enabled,
58 with infinite looping in net_bh.
60 We cannot track such dead loops during route installation,
61 it is infeasible task. The most general solutions would be
62 to keep skb->encapsulation counter (sort of local ttl),
63 and silently drop packet when it expires. It is the best
64 solution, but it supposes maintaining new variable in ALL
65 skb, even if no tunneling is used.
67 Current solution: t->recursion lock breaks dead loops. It looks
68 like dev->tbusy flag, but I preferred new variable, because
69 the semantics is different. One day, when hard_start_xmit
70 will be multithreaded we will have to use skb->encapsulation.
74 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case,
76 if we copy it from packet being encapsulated to upper header.
77 It is very good solution, but it introduces two problems:
79 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80 do not work over tunnels.
81 - traceroute does not work. I planned to relay ICMP from tunnel,
82 so that this problem would be solved and traceroute output
83 would even more informative. This idea appeared to be wrong:
84 only Linux complies to rfc1812 now (yes, guys, Linux is the only
85 true router now :-)), all routers (at least, in neighbourhood of mine)
86 return only 8 bytes of payload. It is the end.
88 Hence, if we want that OSPF worked or traceroute said something reasonable,
89 we should search for another solution.
91 One of them is to parse packet trying to detect inner encapsulation
92 made by our node. It is difficult or even impossible, especially,
93 taking into account fragmentation. To be short, it is not a solution at all.
95 Current solution: The solution was UNEXPECTEDLY SIMPLE.
96 We force DF flag on tunnels with preconfigured hop limit,
97 that is ALL. :-) Well, it does not remove the problem completely,
98 but exponential growth of network traffic is changed to linear
99 (branches, that exceed pmtu are pruned) and tunnel mtu
100 fastly degrades to value <68, where looping stops.
101 Yes, it is not good if there exists a router in the loop,
102 which does not force DF, even when encapsulating packets have DF set.
103 But it is not our problem! Nobody could accuse us, we made
104 all that we could make. Even if it is your gated who injected
105 fatal route to network, even if it were you who configured
106 fatal static route: you are innocent. :-)
110 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111 practically identical code. It would be good to glue them
112 together, but it is not very evident, how to make them modular.
113 sit is integral part of IPv6, ipip and gre are naturally modular.
114 We could extract common parts (hash table, ioctl etc)
115 to a separate module (ip_tunnel.c).
117 Alexey Kuznetsov.
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
123 /* Fallback tunnel: no source, no destination, no key, no options */
125 static int ipgre_fb_tunnel_init(struct net_device *dev);
127 #define HASH_SIZE 16
129 static int ipgre_net_id;
/*
 * Per-network-namespace GRE state; looked up everywhere in this file with
 * net_generic(net, ipgre_net_id).
 */
130 struct ipgre_net {
/* Tunnel chains, indexed [table 0..3][hash bucket]; see __ipgre_bucket(). */
131 struct ip_tunnel *tunnels[4][HASH_SIZE];
/* Fallback device ("gre0"), allocated in ipgre_init_net(). */
133 struct net_device *fb_tunnel_dev;
136 /* Tunnel hash table */
139 4 hash tables:
141 3: (remote,local)
142 2: (remote,*)
143 1: (*,local)
144 0: (*,*)
146 We require exact key match i.e. if a key is present in packet
147 it will match only tunnel with the same key; if it is not present,
148 it will match only keyless tunnel.
150 All keyless packets, if not matched by configured keyless tunnels,
151 will match fallback tunnel.
/*
 * Fold an address or key into a 4-bit bucket index (0..15).
 * The argument is parenthesized so that passing an expression
 * (e.g. "a ^ b") cannot be torn apart by the cast or the shift.
 */
#define HASH(addr) (((__force u32)(addr) ^ ((__force u32)(addr) >> 4)) & 0xF)
156 #define tunnels_r_l tunnels[3]
157 #define tunnels_r tunnels[2]
158 #define tunnels_l tunnels[1]
159 #define tunnels_wc tunnels[0]
161 static DEFINE_RWLOCK(ipgre_lock);
163 /* Given src, dst and key, find appropriate for input tunnel. */
165 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
166 __be32 remote, __be32 local, __be32 key)
168 unsigned h0 = HASH(remote);
169 unsigned h1 = HASH(key);
170 struct ip_tunnel *t;
171 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
173 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
174 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
175 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
176 return t;
179 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
180 if (remote == t->parms.iph.daddr) {
181 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
182 return t;
185 for (t = ign->tunnels_l[h1]; t; t = t->next) {
186 if (local == t->parms.iph.saddr ||
187 (local == t->parms.iph.daddr &&
188 ipv4_is_multicast(local))) {
189 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
190 return t;
193 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
194 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
195 return t;
198 if (ign->fb_tunnel_dev->flags&IFF_UP)
199 return netdev_priv(ign->fb_tunnel_dev);
200 return NULL;
203 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
204 struct ip_tunnel_parm *parms)
206 __be32 remote = parms->iph.daddr;
207 __be32 local = parms->iph.saddr;
208 __be32 key = parms->i_key;
209 unsigned h = HASH(key);
210 int prio = 0;
212 if (local)
213 prio |= 1;
214 if (remote && !ipv4_is_multicast(remote)) {
215 prio |= 2;
216 h ^= HASH(remote);
219 return &ign->tunnels[prio][h];
222 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
223 struct ip_tunnel *t)
225 return __ipgre_bucket(ign, &t->parms);
228 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
230 struct ip_tunnel **tp = ipgre_bucket(ign, t);
232 t->next = *tp;
233 write_lock_bh(&ipgre_lock);
234 *tp = t;
235 write_unlock_bh(&ipgre_lock);
238 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
240 struct ip_tunnel **tp;
242 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
243 if (t == *tp) {
244 write_lock_bh(&ipgre_lock);
245 *tp = t->next;
246 write_unlock_bh(&ipgre_lock);
247 break;
252 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
253 struct ip_tunnel_parm *parms, int create)
255 __be32 remote = parms->iph.daddr;
256 __be32 local = parms->iph.saddr;
257 __be32 key = parms->i_key;
258 struct ip_tunnel *t, **tp, *nt;
259 struct net_device *dev;
260 char name[IFNAMSIZ];
261 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
263 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
264 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
265 if (key == t->parms.i_key)
266 return t;
269 if (!create)
270 return NULL;
272 if (parms->name[0])
273 strlcpy(name, parms->name, IFNAMSIZ);
274 else
275 sprintf(name, "gre%%d");
277 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
278 if (!dev)
279 return NULL;
281 dev_net_set(dev, net);
283 if (strchr(name, '%')) {
284 if (dev_alloc_name(dev, name) < 0)
285 goto failed_free;
288 dev->init = ipgre_tunnel_init;
289 nt = netdev_priv(dev);
290 nt->parms = *parms;
292 if (register_netdevice(dev) < 0)
293 goto failed_free;
295 dev_hold(dev);
296 ipgre_tunnel_link(ign, nt);
297 return nt;
299 failed_free:
300 free_netdev(dev);
301 return NULL;
304 static void ipgre_tunnel_uninit(struct net_device *dev)
306 struct net *net = dev_net(dev);
307 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
309 ipgre_tunnel_unlink(ign, netdev_priv(dev));
310 dev_put(dev);
314 static void ipgre_err(struct sk_buff *skb, u32 info)
317 /* All the routers (except for Linux) return only
318 8 bytes of packet payload. It means, that precise relaying of
319 ICMP in the real Internet is absolutely infeasible.
321 Moreover, Cisco "wise men" put GRE key to the third word
322 in GRE header. It makes impossible maintaining even soft state for keyed
323 GRE tunnels with enabled checksum. Tell them "thank you".
325 Well, I wonder, rfc1812 was written by Cisco employee,
326 what the hell these idiots break standrads established
327 by themself???
330 struct iphdr *iph = (struct iphdr*)skb->data;
331 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
332 int grehlen = (iph->ihl<<2) + 4;
333 const int type = icmp_hdr(skb)->type;
334 const int code = icmp_hdr(skb)->code;
335 struct ip_tunnel *t;
336 __be16 flags;
338 flags = p[0];
339 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
340 if (flags&(GRE_VERSION|GRE_ROUTING))
341 return;
342 if (flags&GRE_KEY) {
343 grehlen += 4;
344 if (flags&GRE_CSUM)
345 grehlen += 4;
349 /* If only 8 bytes returned, keyed message will be dropped here */
350 if (skb_headlen(skb) < grehlen)
351 return;
353 switch (type) {
354 default:
355 case ICMP_PARAMETERPROB:
356 return;
358 case ICMP_DEST_UNREACH:
359 switch (code) {
360 case ICMP_SR_FAILED:
361 case ICMP_PORT_UNREACH:
362 /* Impossible event. */
363 return;
364 case ICMP_FRAG_NEEDED:
365 /* Soft state for pmtu is maintained by IP core. */
366 return;
367 default:
368 /* All others are translated to HOST_UNREACH.
369 rfc2003 contains "deep thoughts" about NET_UNREACH,
370 I believe they are just ether pollution. --ANK
372 break;
374 break;
375 case ICMP_TIME_EXCEEDED:
376 if (code != ICMP_EXC_TTL)
377 return;
378 break;
381 read_lock(&ipgre_lock);
382 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
383 (flags&GRE_KEY) ?
384 *(((__be32*)p) + (grehlen>>2) - 1) : 0);
385 if (t == NULL || t->parms.iph.daddr == 0 ||
386 ipv4_is_multicast(t->parms.iph.daddr))
387 goto out;
389 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
390 goto out;
392 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
393 t->err_count++;
394 else
395 t->err_count = 1;
396 t->err_time = jiffies;
397 out:
398 read_unlock(&ipgre_lock);
399 return;
402 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
404 if (INET_ECN_is_ce(iph->tos)) {
405 if (skb->protocol == htons(ETH_P_IP)) {
406 IP_ECN_set_ce(ip_hdr(skb));
407 } else if (skb->protocol == htons(ETH_P_IPV6)) {
408 IP6_ECN_set_ce(ipv6_hdr(skb));
413 static inline u8
414 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
416 u8 inner = 0;
417 if (skb->protocol == htons(ETH_P_IP))
418 inner = old_iph->tos;
419 else if (skb->protocol == htons(ETH_P_IPV6))
420 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
421 return INET_ECN_encapsulate(tos, inner);
/*
 * Receive handler for IPPROTO_GRE (ipgre_protocol.handler).
 *
 * Reads the GRE flags and the optional checksum, key and sequence words,
 * then looks up the tunnel by (outer source, outer destination, key)
 * under the ipgre_lock read lock.  On a match the GRE header is pulled,
 * checksum and sequence requirements of the tunnel are enforced, the
 * tunnel device's rx statistics are updated and the inner packet is
 * passed to netif_rx() on the tunnel device.
 *
 * Packets with a non-zero GRE version or the routing flag are dropped.
 * When no tunnel matches, an ICMP port-unreachable is sent and the skb
 * is freed.  The function always returns 0.
 */
424 static int ipgre_rcv(struct sk_buff *skb)
426 struct iphdr *iph;
427 u8 *h;
428 __be16 flags;
429 __sum16 csum = 0;
430 __be32 key = 0;
431 u32 seqno = 0;
432 struct ip_tunnel *tunnel;
433 int offset = 4;
435 if (!pskb_may_pull(skb, 16))
436 goto drop_nolock;
438 iph = ip_hdr(skb);
439 h = skb->data;
440 flags = *(__be16*)h;
442 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
443 /* - Version must be 0.
444 - We do not support routing headers.
446 if (flags&(GRE_VERSION|GRE_ROUTING))
447 goto drop_nolock;
449 if (flags&GRE_CSUM) {
450 switch (skb->ip_summed) {
451 case CHECKSUM_COMPLETE:
452 csum = csum_fold(skb->csum);
453 if (!csum)
454 break;
455 /* fall through */
456 case CHECKSUM_NONE:
457 skb->csum = 0;
458 csum = __skb_checksum_complete(skb);
459 skb->ip_summed = CHECKSUM_COMPLETE;
461 offset += 4;
463 if (flags&GRE_KEY) {
464 key = *(__be32*)(h + offset);
465 offset += 4;
467 if (flags&GRE_SEQ) {
468 seqno = ntohl(*(__be32*)(h + offset));
469 offset += 4;
473 read_lock(&ipgre_lock);
474 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
475 iph->saddr, iph->daddr, key)) != NULL) {
476 struct net_device_stats *stats = &tunnel->dev->stats;
478 secpath_reset(skb);
480 skb->protocol = *(__be16*)(h + 2);
481 /* WCCP version 1 and 2 protocol decoding.
482 * - Change protocol to IP
483 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
485 if (flags == 0 &&
486 skb->protocol == htons(ETH_P_WCCP)) {
487 skb->protocol = htons(ETH_P_IP);
488 if ((*(h + offset) & 0xF0) != 0x40)
489 offset += 4;
492 skb->mac_header = skb->network_header;
493 __pskb_pull(skb, offset);
494 skb_reset_network_header(skb);
495 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
496 skb->pkt_type = PACKET_HOST;
497 #ifdef CONFIG_NET_IPGRE_BROADCAST
498 if (ipv4_is_multicast(iph->daddr)) {
499 /* Looped back packet, drop it! */
500 if (skb->rtable->fl.iif == 0)
501 goto drop;
502 stats->multicast++;
503 skb->pkt_type = PACKET_BROADCAST;
505 #endif
507 if (((flags&GRE_CSUM) && csum) ||
508 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
509 stats->rx_crc_errors++;
510 stats->rx_errors++;
511 goto drop;
513 if (tunnel->parms.i_flags&GRE_SEQ) {
514 if (!(flags&GRE_SEQ) ||
515 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
516 stats->rx_fifo_errors++;
517 stats->rx_errors++;
518 goto drop;
520 tunnel->i_seqno = seqno + 1;
522 stats->rx_packets++;
523 stats->rx_bytes += skb->len;
524 skb->dev = tunnel->dev;
525 dst_release(skb->dst);
526 skb->dst = NULL;
527 nf_reset(skb);
528 ipgre_ecn_decapsulate(iph, skb);
529 netif_rx(skb);
530 read_unlock(&ipgre_lock);
531 return(0);
533 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
535 drop:
536 read_unlock(&ipgre_lock);
537 drop_nolock:
538 kfree_skb(skb);
539 return(0);
/*
 * Transmit handler (installed as dev->hard_start_xmit by
 * ipgre_tunnel_setup()).
 *
 * Prepends an outer IPv4 header plus GRE header to @skb and hands the
 * result to IPTUNNEL_XMIT().  Outline:
 *  - tunnel->recursion is used to detect re-entry; a re-entrant packet is
 *    counted as a collision and dropped.
 *  - When dev->header_ops is set, the outer header template is read from
 *    the front of the skb; otherwise it comes from tunnel->parms.iph.
 *  - With no configured destination (NBMA tunnel) the destination is
 *    taken from the inner route's gateway (IPv4) or from an
 *    IPv4-compatible neighbour/destination address (IPv6).
 *  - The outer route is resolved, path MTU is propagated to the inner
 *    dst, and oversized packets trigger ICMP / ICMPv6 "too big" errors.
 *  - Soft error state recorded by ipgre_err() is reported through
 *    dst_link_failure().
 *  - Optional sequence number, key and checksum words are written from
 *    the end of the GRE header backwards.
 *
 * Always returns 0.  On the error paths tx_errors is incremented and the
 * skb is freed.
 */
542 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
544 struct ip_tunnel *tunnel = netdev_priv(dev);
545 struct net_device_stats *stats = &tunnel->dev->stats;
546 struct iphdr *old_iph = ip_hdr(skb);
547 struct iphdr *tiph;
548 u8 tos;
549 __be16 df;
550 struct rtable *rt; /* Route to the other host */
551 struct net_device *tdev; /* Device to other host */
552 struct iphdr *iph; /* Our new IP header */
553 unsigned int max_headroom; /* The extra header space needed */
554 int gre_hlen;
555 __be32 dst;
556 int mtu;
558 if (tunnel->recursion++) {
559 stats->collisions++;
560 goto tx_error;
563 if (dev->header_ops) {
564 gre_hlen = 0;
565 tiph = (struct iphdr*)skb->data;
566 } else {
567 gre_hlen = tunnel->hlen;
568 tiph = &tunnel->parms.iph;
571 if ((dst = tiph->daddr) == 0) {
572 /* NBMA tunnel */
574 if (skb->dst == NULL) {
575 stats->tx_fifo_errors++;
576 goto tx_error;
579 if (skb->protocol == htons(ETH_P_IP)) {
580 rt = skb->rtable;
581 if ((dst = rt->rt_gateway) == 0)
582 goto tx_error_icmp;
584 #ifdef CONFIG_IPV6
585 else if (skb->protocol == htons(ETH_P_IPV6)) {
586 struct in6_addr *addr6;
587 int addr_type;
588 struct neighbour *neigh = skb->dst->neighbour;
590 if (neigh == NULL)
591 goto tx_error;
593 addr6 = (struct in6_addr*)&neigh->primary_key;
594 addr_type = ipv6_addr_type(addr6);
596 if (addr_type == IPV6_ADDR_ANY) {
597 addr6 = &ipv6_hdr(skb)->daddr;
598 addr_type = ipv6_addr_type(addr6);
601 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
602 goto tx_error_icmp;
604 dst = addr6->s6_addr32[3];
606 #endif
607 else
608 goto tx_error;
611 tos = tiph->tos;
612 if (tos&1) {
613 if (skb->protocol == htons(ETH_P_IP))
614 tos = old_iph->tos;
615 tos &= ~1;
619 struct flowi fl = { .oif = tunnel->parms.link,
620 .nl_u = { .ip4_u =
621 { .daddr = dst,
622 .saddr = tiph->saddr,
623 .tos = RT_TOS(tos) } },
624 .proto = IPPROTO_GRE };
625 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
626 stats->tx_carrier_errors++;
627 goto tx_error;
630 tdev = rt->u.dst.dev;
632 if (tdev == dev) {
633 ip_rt_put(rt);
634 stats->collisions++;
635 goto tx_error;
638 df = tiph->frag_off;
639 if (df)
640 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
641 else
642 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
644 if (skb->dst)
645 skb->dst->ops->update_pmtu(skb->dst, mtu);
647 if (skb->protocol == htons(ETH_P_IP)) {
648 df |= (old_iph->frag_off&htons(IP_DF));
650 if ((old_iph->frag_off&htons(IP_DF)) &&
651 mtu < ntohs(old_iph->tot_len)) {
652 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
653 ip_rt_put(rt);
654 goto tx_error;
657 #ifdef CONFIG_IPV6
658 else if (skb->protocol == htons(ETH_P_IPV6)) {
659 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
661 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
662 if ((tunnel->parms.iph.daddr &&
663 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
664 rt6->rt6i_dst.plen == 128) {
665 rt6->rt6i_flags |= RTF_MODIFIED;
666 skb->dst->metrics[RTAX_MTU-1] = mtu;
670 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
671 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
672 ip_rt_put(rt);
673 goto tx_error;
676 #endif
678 if (tunnel->err_count > 0) {
679 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
680 tunnel->err_count--;
682 dst_link_failure(skb);
683 } else
684 tunnel->err_count = 0;
687 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
689 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
690 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
691 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
692 if (!new_skb) {
693 ip_rt_put(rt);
694 stats->tx_dropped++;
695 dev_kfree_skb(skb);
696 tunnel->recursion--;
697 return 0;
699 if (skb->sk)
700 skb_set_owner_w(new_skb, skb->sk);
701 dev_kfree_skb(skb);
702 skb = new_skb;
703 old_iph = ip_hdr(skb);
706 skb->transport_header = skb->network_header;
707 skb_push(skb, gre_hlen);
708 skb_reset_network_header(skb);
709 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
710 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
711 IPSKB_REROUTED);
712 dst_release(skb->dst);
713 skb->dst = &rt->u.dst;
716 * Push down and install the IPIP header.
719 iph = ip_hdr(skb);
720 iph->version = 4;
721 iph->ihl = sizeof(struct iphdr) >> 2;
722 iph->frag_off = df;
723 iph->protocol = IPPROTO_GRE;
724 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
725 iph->daddr = rt->rt_dst;
726 iph->saddr = rt->rt_src;
728 if ((iph->ttl = tiph->ttl) == 0) {
729 if (skb->protocol == htons(ETH_P_IP))
730 iph->ttl = old_iph->ttl;
731 #ifdef CONFIG_IPV6
732 else if (skb->protocol == htons(ETH_P_IPV6))
733 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
734 #endif
735 else
736 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
739 ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
740 ((__be16*)(iph+1))[1] = skb->protocol;
742 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
743 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
745 if (tunnel->parms.o_flags&GRE_SEQ) {
746 ++tunnel->o_seqno;
747 *ptr = htonl(tunnel->o_seqno);
748 ptr--;
750 if (tunnel->parms.o_flags&GRE_KEY) {
751 *ptr = tunnel->parms.o_key;
752 ptr--;
754 if (tunnel->parms.o_flags&GRE_CSUM) {
755 *ptr = 0;
756 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
760 nf_reset(skb);
762 IPTUNNEL_XMIT();
763 tunnel->recursion--;
764 return 0;
766 tx_error_icmp:
767 dst_link_failure(skb);
769 tx_error:
770 stats->tx_errors++;
771 dev_kfree_skb(skb);
772 tunnel->recursion--;
773 return 0;
776 static void ipgre_tunnel_bind_dev(struct net_device *dev)
778 struct net_device *tdev = NULL;
779 struct ip_tunnel *tunnel;
780 struct iphdr *iph;
781 int hlen = LL_MAX_HEADER;
782 int mtu = ETH_DATA_LEN;
783 int addend = sizeof(struct iphdr) + 4;
785 tunnel = netdev_priv(dev);
786 iph = &tunnel->parms.iph;
788 /* Guess output device to choose reasonable mtu and hard_header_len */
790 if (iph->daddr) {
791 struct flowi fl = { .oif = tunnel->parms.link,
792 .nl_u = { .ip4_u =
793 { .daddr = iph->daddr,
794 .saddr = iph->saddr,
795 .tos = RT_TOS(iph->tos) } },
796 .proto = IPPROTO_GRE };
797 struct rtable *rt;
798 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
799 tdev = rt->u.dst.dev;
800 ip_rt_put(rt);
802 dev->flags |= IFF_POINTOPOINT;
805 if (!tdev && tunnel->parms.link)
806 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
808 if (tdev) {
809 hlen = tdev->hard_header_len;
810 mtu = tdev->mtu;
812 dev->iflink = tunnel->parms.link;
814 /* Precalculate GRE options length */
815 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
816 if (tunnel->parms.o_flags&GRE_CSUM)
817 addend += 4;
818 if (tunnel->parms.o_flags&GRE_KEY)
819 addend += 4;
820 if (tunnel->parms.o_flags&GRE_SEQ)
821 addend += 4;
823 dev->hard_header_len = hlen + addend;
824 dev->mtu = mtu - addend;
825 tunnel->hlen = addend;
/*
 * Tunnel configuration ioctl (installed as dev->do_ioctl by
 * ipgre_tunnel_setup()).  The user buffer at ifr->ifr_ifru.ifru_data
 * holds a struct ip_tunnel_parm.
 *
 *  SIOCGETTUNNEL  - copy a tunnel's parameters to user space.  On the
 *                   fallback device the tunnel is first looked up from
 *                   the user-supplied parameters; otherwise (or when the
 *                   lookup fails) the device's own tunnel is reported.
 *  SIOCADDTUNNEL,
 *  SIOCCHGTUNNEL  - require CAP_NET_ADMIN.  Parameters are validated
 *                   (IPv4, IPPROTO_GRE, ihl 5, only DF in frag_off, no
 *                   GRE version/routing flags); DF is forced when a ttl
 *                   is given and keys are cleared when the key flag is
 *                   absent.  ADD creates the tunnel if it does not
 *                   exist; CHG on a non-fallback device re-hashes the
 *                   device's tunnel under the new addresses and keys.
 *  SIOCDELTUNNEL  - requires CAP_NET_ADMIN; unregisters the device.  On
 *                   the fallback device the target is looked up from the
 *                   user parameters and the fallback tunnel itself is
 *                   refused with -EPERM.
 *
 * Returns 0 on success or a negative errno.
 */
829 static int
830 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
832 int err = 0;
833 struct ip_tunnel_parm p;
834 struct ip_tunnel *t;
835 struct net *net = dev_net(dev);
836 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
838 switch (cmd) {
839 case SIOCGETTUNNEL:
840 t = NULL;
841 if (dev == ign->fb_tunnel_dev) {
842 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
843 err = -EFAULT;
844 break;
846 t = ipgre_tunnel_locate(net, &p, 0);
848 if (t == NULL)
849 t = netdev_priv(dev);
850 memcpy(&p, &t->parms, sizeof(p));
851 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
852 err = -EFAULT;
853 break;
855 case SIOCADDTUNNEL:
856 case SIOCCHGTUNNEL:
857 err = -EPERM;
858 if (!capable(CAP_NET_ADMIN))
859 goto done;
861 err = -EFAULT;
862 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
863 goto done;
865 err = -EINVAL;
866 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
867 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
868 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
869 goto done;
870 if (p.iph.ttl)
871 p.iph.frag_off |= htons(IP_DF);
873 if (!(p.i_flags&GRE_KEY))
874 p.i_key = 0;
875 if (!(p.o_flags&GRE_KEY))
876 p.o_key = 0;
878 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
880 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
881 if (t != NULL) {
882 if (t->dev != dev) {
883 err = -EEXIST;
884 break;
886 } else {
887 unsigned nflags=0;
889 t = netdev_priv(dev);
891 if (ipv4_is_multicast(p.iph.daddr))
892 nflags = IFF_BROADCAST;
893 else if (p.iph.daddr)
894 nflags = IFF_POINTOPOINT;
896 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
897 err = -EINVAL;
898 break;
900 ipgre_tunnel_unlink(ign, t);
901 t->parms.iph.saddr = p.iph.saddr;
902 t->parms.iph.daddr = p.iph.daddr;
903 t->parms.i_key = p.i_key;
904 t->parms.o_key = p.o_key;
905 memcpy(dev->dev_addr, &p.iph.saddr, 4);
906 memcpy(dev->broadcast, &p.iph.daddr, 4);
907 ipgre_tunnel_link(ign, t);
908 netdev_state_change(dev);
912 if (t) {
913 err = 0;
914 if (cmd == SIOCCHGTUNNEL) {
915 t->parms.iph.ttl = p.iph.ttl;
916 t->parms.iph.tos = p.iph.tos;
917 t->parms.iph.frag_off = p.iph.frag_off;
918 if (t->parms.link != p.link) {
919 t->parms.link = p.link;
920 ipgre_tunnel_bind_dev(dev);
921 netdev_state_change(dev);
924 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
925 err = -EFAULT;
926 } else
927 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
928 break;
930 case SIOCDELTUNNEL:
931 err = -EPERM;
932 if (!capable(CAP_NET_ADMIN))
933 goto done;
935 if (dev == ign->fb_tunnel_dev) {
936 err = -EFAULT;
937 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
938 goto done;
939 err = -ENOENT;
940 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
941 goto done;
942 err = -EPERM;
943 if (t == netdev_priv(ign->fb_tunnel_dev))
944 goto done;
945 dev = t->dev;
947 unregister_netdevice(dev);
948 err = 0;
949 break;
951 default:
952 err = -EINVAL;
955 done:
956 return err;
959 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
961 struct ip_tunnel *tunnel = netdev_priv(dev);
962 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
963 return -EINVAL;
964 dev->mtu = new_mtu;
965 return 0;
968 /* Nice toy. Unfortunately, useless in real life :-)
969 It allows to construct virtual multiprotocol broadcast "LAN"
970 over the Internet, provided multicast routing is tuned.
973 I have no idea was this bicycle invented before me,
974 so that I had to set ARPHRD_IPGRE to a random value.
975 I have an impression, that Cisco could make something similar,
976 but this feature is apparently missing in IOS<=11.2(8).
978 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
979 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
981 ping -t 255 224.66.66.66
983 If nobody answers, mbone does not work.
985 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
986 ip addr add 10.66.66.<somewhat>/24 dev Universe
987 ifconfig Universe up
988 ifconfig Universe add fe80::<Your_real_addr>/10
989 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
990 ftp 10.66.66.66
992 ftp fec0:6666:6666::193.233.7.65
997 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
998 unsigned short type,
999 const void *daddr, const void *saddr, unsigned len)
1001 struct ip_tunnel *t = netdev_priv(dev);
1002 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1003 __be16 *p = (__be16*)(iph+1);
1005 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1006 p[0] = t->parms.o_flags;
1007 p[1] = htons(type);
1010 * Set the source hardware address.
1013 if (saddr)
1014 memcpy(&iph->saddr, saddr, 4);
1016 if (daddr) {
1017 memcpy(&iph->daddr, daddr, 4);
1018 return t->hlen;
1020 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1021 return t->hlen;
1023 return -t->hlen;
1026 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1028 struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1029 memcpy(haddr, &iph->saddr, 4);
1030 return 4;
/*
 * Link-layer header operations; attached to a tunnel device by
 * ipgre_tunnel_init() for multicast-destination and NBMA tunnels.
 */
1033 static const struct header_ops ipgre_header_ops = {
1034 .create = ipgre_header,
1035 .parse = ipgre_header_parse,
1038 #ifdef CONFIG_NET_IPGRE_BROADCAST
1039 static int ipgre_open(struct net_device *dev)
1041 struct ip_tunnel *t = netdev_priv(dev);
1043 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1044 struct flowi fl = { .oif = t->parms.link,
1045 .nl_u = { .ip4_u =
1046 { .daddr = t->parms.iph.daddr,
1047 .saddr = t->parms.iph.saddr,
1048 .tos = RT_TOS(t->parms.iph.tos) } },
1049 .proto = IPPROTO_GRE };
1050 struct rtable *rt;
1051 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1052 return -EADDRNOTAVAIL;
1053 dev = rt->u.dst.dev;
1054 ip_rt_put(rt);
1055 if (__in_dev_get_rtnl(dev) == NULL)
1056 return -EADDRNOTAVAIL;
1057 t->mlink = dev->ifindex;
1058 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1060 return 0;
1063 static int ipgre_close(struct net_device *dev)
1065 struct ip_tunnel *t = netdev_priv(dev);
1066 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1067 struct in_device *in_dev;
1068 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1069 if (in_dev) {
1070 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1071 in_dev_put(in_dev);
1074 return 0;
1077 #endif
1079 static void ipgre_tunnel_setup(struct net_device *dev)
1081 dev->uninit = ipgre_tunnel_uninit;
1082 dev->destructor = free_netdev;
1083 dev->hard_start_xmit = ipgre_tunnel_xmit;
1084 dev->do_ioctl = ipgre_tunnel_ioctl;
1085 dev->change_mtu = ipgre_tunnel_change_mtu;
1087 dev->type = ARPHRD_IPGRE;
1088 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1089 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1090 dev->flags = IFF_NOARP;
1091 dev->iflink = 0;
1092 dev->addr_len = 4;
1093 dev->features |= NETIF_F_NETNS_LOCAL;
1096 static int ipgre_tunnel_init(struct net_device *dev)
1098 struct ip_tunnel *tunnel;
1099 struct iphdr *iph;
1101 tunnel = netdev_priv(dev);
1102 iph = &tunnel->parms.iph;
1104 tunnel->dev = dev;
1105 strcpy(tunnel->parms.name, dev->name);
1107 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1108 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1110 ipgre_tunnel_bind_dev(dev);
1112 if (iph->daddr) {
1113 #ifdef CONFIG_NET_IPGRE_BROADCAST
1114 if (ipv4_is_multicast(iph->daddr)) {
1115 if (!iph->saddr)
1116 return -EINVAL;
1117 dev->flags = IFF_BROADCAST;
1118 dev->header_ops = &ipgre_header_ops;
1119 dev->open = ipgre_open;
1120 dev->stop = ipgre_close;
1122 #endif
1123 } else
1124 dev->header_ops = &ipgre_header_ops;
1126 return 0;
1129 static int ipgre_fb_tunnel_init(struct net_device *dev)
1131 struct ip_tunnel *tunnel = netdev_priv(dev);
1132 struct iphdr *iph = &tunnel->parms.iph;
1133 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1135 tunnel->dev = dev;
1136 strcpy(tunnel->parms.name, dev->name);
1138 iph->version = 4;
1139 iph->protocol = IPPROTO_GRE;
1140 iph->ihl = 5;
1141 tunnel->hlen = sizeof(struct iphdr) + 4;
1143 dev_hold(dev);
1144 ign->tunnels_wc[0] = tunnel;
1145 return 0;
/*
 * IPv4 protocol hooks for GRE; added for IPPROTO_GRE in ipgre_init()
 * and removed in ipgre_fini().
 */
1149 static struct net_protocol ipgre_protocol = {
1150 .handler = ipgre_rcv,
1151 .err_handler = ipgre_err,
1152 .netns_ok = 1,
1155 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1157 int prio;
1159 for (prio = 0; prio < 4; prio++) {
1160 int h;
1161 for (h = 0; h < HASH_SIZE; h++) {
1162 struct ip_tunnel *t;
1163 while ((t = ign->tunnels[prio][h]) != NULL)
1164 unregister_netdevice(t->dev);
1169 static int ipgre_init_net(struct net *net)
1171 int err;
1172 struct ipgre_net *ign;
1174 err = -ENOMEM;
1175 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1176 if (ign == NULL)
1177 goto err_alloc;
1179 err = net_assign_generic(net, ipgre_net_id, ign);
1180 if (err < 0)
1181 goto err_assign;
1183 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1184 ipgre_tunnel_setup);
1185 if (!ign->fb_tunnel_dev) {
1186 err = -ENOMEM;
1187 goto err_alloc_dev;
1190 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1191 dev_net_set(ign->fb_tunnel_dev, net);
1193 if ((err = register_netdev(ign->fb_tunnel_dev)))
1194 goto err_reg_dev;
1196 return 0;
1198 err_reg_dev:
1199 free_netdev(ign->fb_tunnel_dev);
1200 err_alloc_dev:
1201 /* nothing */
1202 err_assign:
1203 kfree(ign);
1204 err_alloc:
1205 return err;
1208 static void ipgre_exit_net(struct net *net)
1210 struct ipgre_net *ign;
1212 ign = net_generic(net, ipgre_net_id);
1213 rtnl_lock();
1214 ipgre_destroy_tunnels(ign);
1215 rtnl_unlock();
1216 kfree(ign);
/*
 * Per-network-namespace init/exit hooks; passed to
 * register_pernet_gen_device() in ipgre_init().
 */
1219 static struct pernet_operations ipgre_net_ops = {
1220 .init = ipgre_init_net,
1221 .exit = ipgre_exit_net,
1225 * And now the modules code and kernel interface.
1228 static int __init ipgre_init(void)
1230 int err;
1232 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1234 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1235 printk(KERN_INFO "ipgre init: can't add protocol\n");
1236 return -EAGAIN;
1239 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1240 if (err < 0)
1241 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1243 return err;
1246 static void __exit ipgre_fini(void)
1248 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1249 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1251 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1254 module_init(ipgre_init);
1255 module_exit(ipgre_fini);
1256 MODULE_LICENSE("GPL");