MOXA linux-2.6.x / linux-2.6.19-uc1 from UC-7110-LX-BOOTLOADER-1.9_VERSION-4.2.tgz
[linux-2.6.19-moxart.git] / net / ipv4 / ip_gre.c
blob690498136bbb73168391bba46e9bece981616168
1 /*
2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_bridge.h>
33 #include <linux/etherdevice.h>
34 #include <linux/llc.h>
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
54 Problems & solutions
55 --------------------
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
62 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it supposes maintaining a new variable in ALL
67 skb, even if no tunneling is used.
69 Current solution: t->recursion lock breaks dead loops. It looks
70 like dev->tbusy flag, but I preferred new variable, because
71 the semantics is different. One day, when hard_start_xmit
72 will be multithreaded we will have to use skb->encapsulation.
76 2. Networking dead loops would not kill routers, but would really
77 kill network. IP hop limit plays role of "t->recursion" in this case,
78 if we copy it from packet being encapsulated to upper header.
79 It is very good solution, but it introduces two problems:
81 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82 do not work over tunnels.
83 - traceroute does not work. I planned to relay ICMP from tunnel,
84 so that this problem would be solved and traceroute output
85 would even more informative. This idea appeared to be wrong:
86 only Linux complies to rfc1812 now (yes, guys, Linux is the only
87 true router now :-)), all routers (at least, in neighbourhood of mine)
88 return only 8 bytes of payload. It is the end.
90 Hence, if we want that OSPF worked or traceroute said something reasonable,
91 we should search for another solution.
93 One of them is to parse packet trying to detect inner encapsulation
94 made by our node. It is difficult or even impossible, especially,
95 taking into account fragmentation. To be short, it is not a solution at all.
97 Current solution: The solution was UNEXPECTEDLY SIMPLE.
98 We force DF flag on tunnels with preconfigured hop limit,
99 that is ALL. :-) Well, it does not remove the problem completely,
100 but exponential growth of network traffic is changed to linear
101 (branches, that exceed pmtu are pruned) and tunnel mtu
102 quickly degrades to a value <68, where looping stops.
103 Yes, it is not good if there exists a router in the loop,
104 which does not force DF, even when encapsulating packets have DF set.
105 But it is not our problem! Nobody could accuse us, we made
106 all that we could make. Even if it is your gated who injected
107 fatal route to network, even if it were you who configured
108 fatal static route: you are innocent. :-)
112 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113 practically identical code. It would be good to glue them
114 together, but it is not very evident, how to make them modular.
115 sit is integral part of IPv6, ipip and gre are naturally modular.
116 We could extract common parts (hash table, ioctl etc)
117 to a separate module (ip_tunnel.c).
119 Alexey Kuznetsov.
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static void ipgre_ether_tunnel_setup(struct net_device *dev);
126 /* Fallback tunnel: no source, no destination, no key, no options */
128 static int ipgre_fb_tunnel_init(struct net_device *dev);
130 static struct net_device *ipgre_fb_tunnel_dev;
132 /* Tunnel hash table */
135 4 hash tables:
137 3: (remote,local)
138 2: (remote,*)
139 1: (*,local)
140 0: (*,*)
142 We require exact key match i.e. if a key is present in packet
143 it will match only tunnel with the same key; if it is not present,
144 it will match only keyless tunnel.
146 All keyless packets, if not matched by configured keyless tunnels,
147 will match fallback tunnel.
150 #define HASH_SIZE 16
151 #define HASH(addr) ((addr^(addr>>4))&0xF)
/* Tunnel hash tables: one chain-head array per address-specificity
 * class, from most specific (remote,local) down to full wildcard. */
static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])	/* (remote, local) */
#define tunnels_r	(tunnels[2])	/* (remote, *)     */
#define tunnels_l	(tunnels[1])	/* (*, local)      */
#define tunnels_wc	(tunnels[0])	/* (*, *) wildcard */

/* Guards the hash chains: packet paths take it for reading,
 * link/unlink take it for writing (writers also hold RTNL). */
static DEFINE_RWLOCK(ipgre_lock);
/* Given src, dst and key, find appropriate for input tunnel.
 *
 * Tries the most specific table first: (remote,local), then (remote,*),
 * then (*,local) — where a multicast destination also counts as "local" —
 * then keyed wildcard tunnels, finally the always-present fallback gre0
 * device.  An exact key match is required in every table: keyless packets
 * (key == 0) only match keyless tunnels.  Caller must hold ipgre_lock
 * for reading. */
static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;

	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr && MULTICAST(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
			return t;
	}

	/* Nothing configured matched: hand the packet to the fallback device. */
	if (ipgre_fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ipgre_fb_tunnel_dev);
	return NULL;
}
/* Return the hash-chain head where tunnel @t belongs, selecting the
 * table by which of its addresses are set.  A multicast remote is
 * treated as unset for bucketing (such tunnels are matched via the
 * (*,local) table on receive). */
static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
{
	u32 remote = t->parms.iph.daddr;
	u32 local = t->parms.iph.saddr;
	u32 key = t->parms.i_key;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !MULTICAST(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &tunnels[prio][h];
}
/* Insert @t at the head of its hash chain.  NOTE(review): the t->next
 * store happens before taking ipgre_lock — this presumes writers are
 * serialized by RTNL so only readers need fencing; confirm all callers
 * hold RTNL. */
static void ipgre_tunnel_link(struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}
/* Remove @t from its hash chain if present.  The chain walk itself is
 * done without the lock (writers are serialized by RTNL); only the
 * pointer splice is fenced against concurrent readers. */
static void ipgre_tunnel_unlink(struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
/* Find the tunnel whose (saddr, daddr, key) match @parms; if absent and
 * @create is set, allocate, register and hash a new tunnel device.
 * Returns the tunnel or NULL on lookup miss / allocation or
 * registration failure.  Caller holds RTNL (register_netdevice). */
static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
	u32 remote = parms->iph.daddr;
	u32 local = parms->iph.saddr;
	u32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	unsigned h = HASH(key);
	int prio = 0;
	char name[IFNAMSIZ];

	/* Same bucket selection as ipgre_bucket(). */
	if (local)
		prio |= 1;
	if (remote && !MULTICAST(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}
	for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		int i;
		/* No name given: pick the first free "gre%d", give up at 99. */
		for (i=1; i<100; i++) {
			sprintf(name, "gre%d", i);
			if (__dev_get_by_name(name) == NULL)
				break;
		}
		if (i==100)
			goto failed;
	}

	/* Vendor extension: iph.id doubles as a mode selector —
	 * ETH_P_BRIDGE requests an Ethernet-over-GRE device.
	 * NOTE(review): overloading the IP id field is unusual; confirm
	 * against the userspace tool that configures these tunnels. */
	if (parms->iph.id == htons(ETH_P_BRIDGE))
		dev = alloc_netdev(sizeof(*t), name, ipgre_ether_tunnel_setup);
	else
		dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);
	nt->parms = *parms;

	if (register_netdevice(dev) < 0) {
		free_netdev(dev);
		goto failed;
	}

	/* Reference dropped by ipgre_tunnel_uninit() on unregister. */
	dev_hold(dev);
	ipgre_tunnel_link(nt);
	return nt;

failed:
	return NULL;
}
/* netdev uninit hook: unhash the tunnel and drop the reference taken
 * when the device was created. */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	ipgre_tunnel_unlink(netdev_priv(dev));
	dev_put(dev);
}
/* ICMP error handler for IPPROTO_GRE: when an ICMP error quotes one of
 * our outgoing GRE packets, record soft error state (err_count /
 * err_time) on the matching tunnel so a later transmit can signal
 * dst_link_failure().  Only the compiled (#ifndef) branch below is
 * live; the #else branch is historical dead code. */
static void ipgre_err(struct sk_buff *skb, u32 info)
{
#ifndef I_WISH_WORLD_WERE_PERFECT

	/* It is not :-( All the routers (except for Linux) return only
	   8 bytes of packet payload. It means, that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put GRE key to the third word
	   in GRE header. It makes impossible maintaining even soft state for keyed
	   GRE tunnels with enabled checksum. Tell them "thank you".

	   Well, I wonder, rfc1812 was written by a Cisco employee —
	   what the hell, these idiots break standards established
	   by themselves???
	 */

	struct iphdr *iph = (struct iphdr*)skb->data;
	u16 *p = (u16*)(skb->data+(iph->ihl<<2));	/* quoted GRE header */
	int grehlen = (iph->ihl<<2) + 4;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct ip_tunnel *t;
	u16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	/* The quoted header is a packet WE sent, so saddr/daddr are
	 * swapped relative to receive-side lookup. */
	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
		goto out;

	/* ttl==0 tunnels copy the inner TTL, so TTL exceeded is expected
	 * (e.g. traceroute through the tunnel) — not a tunnel error. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
#else
	/* NOTE(review): this branch is never compiled
	 * (I_WISH_WORLD_WERE_PERFECT is not defined anywhere) and is
	 * stale — it references 'dp' and 'len', which are not parameters
	 * of this function.  Kept verbatim as historical reference. */
	struct iphdr *iph = (struct iphdr*)dp;
	struct iphdr *eiph;
	u16 *p = (u16*)(dp+(iph->ihl<<2));
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	int rel_type = 0;
	int rel_code = 0;
	__be32 rel_info = 0;
	__u32 n = 0;
	u16 flags;
	int grehlen = (iph->ihl<<2) + 4;
	struct sk_buff *skb2;
	struct flowi fl;
	struct rtable *rt;

	if (p[1] != htons(ETH_P_IP))
		return;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_CSUM)
			grehlen += 4;
		if (flags&GRE_KEY)
			grehlen += 4;
		if (flags&GRE_SEQ)
			grehlen += 4;
	}
	if (len < grehlen + sizeof(struct iphdr))
		return;
	eiph = (struct iphdr*)(dp + grehlen);

	switch (type) {
	default:
		return;
	case ICMP_PARAMETERPROB:
		n = ntohl(skb->h.icmph->un.gateway) >> 24;
		if (n < (iph->ihl<<2))
			return;

		/* So... This guy found something strange INSIDE encapsulated
		   packet. Well, he is fool, but what can we do ?
		 */
		rel_type = ICMP_PARAMETERPROB;
		n -= grehlen;
		rel_info = htonl(n << 24);
		break;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* And it is the only really necessary thing :-) */
			n = ntohs(skb->h.icmph->un.frag.mtu);
			if (n < grehlen+68)
				return;
			n -= grehlen;
			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
			if (n > ntohs(eiph->tot_len))
				return;
			rel_info = htonl(n);
			break;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe, it is just ether pollution. --ANK
			 */
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	/* Prepare fake skb to feed it to icmp_send */
	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (skb2 == NULL)
		return;
	dst_release(skb2->dst);
	skb2->dst = NULL;
	skb_pull(skb2, skb->data - (u8*)eiph);
	skb2->nh.raw = skb2->data;

	/* Try to guess incoming interface */
	memset(&fl, 0, sizeof(fl));
	fl.fl4_dst = eiph->saddr;
	fl.fl4_tos = RT_TOS(eiph->tos);
	fl.proto = IPPROTO_GRE;
	if (ip_route_output_key(&rt, &fl)) {
		kfree_skb(skb2);
		return;
	}
	skb2->dev = rt->u.dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags&RTCF_LOCAL) {
		ip_rt_put(rt);
		rt = NULL;
		fl.fl4_dst = eiph->daddr;
		fl.fl4_src = eiph->saddr;
		fl.fl4_tos = eiph->tos;
		if (ip_route_output_key(&rt, &fl) ||
		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
			ip_rt_put(rt);
			kfree_skb(skb2);
			return;
		}
	} else {
		ip_rt_put(rt);
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
		    skb2->dst->dev->type != ARPHRD_IPGRE) {
			kfree_skb(skb2);
			return;
		}
	}

	/* change mtu on this route */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		if (n > dst_mtu(skb2->dst)) {
			kfree_skb(skb2);
			return;
		}
		skb2->dst->ops->update_pmtu(skb2->dst, n);
	} else if (type == ICMP_TIME_EXCEEDED) {
		struct ip_tunnel *t = netdev_priv(skb2->dev);
		if (t->parms.iph.ttl) {
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
		}
	}

	icmp_send(skb2, rel_type, rel_code, rel_info);
	kfree_skb(skb2);
#endif
}
/* On decapsulation, propagate a Congestion Experienced mark from the
 * outer IP header into the inner IPv4/IPv6 header (ECN); non-IP inner
 * payloads are left untouched. */
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(skb->nh.iph);
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(skb->nh.ipv6h);
		}
	}
}
551 static inline u8
552 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
554 u8 inner = 0;
555 if (skb->protocol == htons(ETH_P_IP))
556 inner = old_iph->tos;
557 else if (skb->protocol == htons(ETH_P_IPV6))
558 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
559 return INET_ECN_encapsulate(tos, inner);
/* Vendor extension for Ethernet-over-GRE tunnels: classify the inner
 * frame.  ETH_P_BRIDGE payloads get their protocol from the inner
 * Ethernet header; returns 0 when the frame cannot (or should not) be
 * delivered further, in which case the caller drops the skb.
 * NOTE(review): in the LLC_SAP_BSPAN branch br_stp_rcv_raw() is handed
 * the skb and 0 is returned, which makes the caller drop it as well —
 * confirm skb ownership with the (out-of-tree) bridge helper. */
static __be16 ipgre_type_trans(struct sk_buff *skb, struct net_device *dev)
{
	if (skb->protocol == htons(ETH_P_BRIDGE)) {
		if (!pskb_may_pull(skb, ETH_HLEN))
			return 0;
		return eth_type_trans(skb, dev);
	}
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
	else if (skb->protocol == htons(LLC_SAP_BSPAN)) {
		br_stp_rcv_raw(skb, dev);
		return 0;
	}
#endif

	return 0;
}
/* IPPROTO_GRE receive handler: parse and validate the GRE header
 * (version 0, optional checksum/key/sequence fields), look up the
 * owning tunnel and re-inject the inner packet into the stack.
 * The skb is consumed on every path; packets that match no tunnel are
 * answered with ICMP port-unreachable. */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	u16    flags;
	u16    csum = 0;
	u32    key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */

	/* 16 = max GRE header we parse: 4 base + csum + key + seq. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = skb->nh.iph;
	h = skb->data;
	flags = *(u16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = (u16)csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(u32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(u32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
		secpath_reset(skb);

		skb->protocol = *(u16*)(h + 2);
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip extra 4 bytes in GRE header
		 */
		if (flags == 0 &&
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the GRE header; inner packet becomes the new nh. */
		skb->mac.raw = skb->nh.raw;
		skb->nh.raw = __pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb->h.raw, offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (((struct rtable*)skb->dst)->fl.iif == 0)
				goto drop;
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum policy: a present-but-bad checksum, or a missing
		 * checksum on a tunnel that requires one, is an rx error. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* Drop out-of-order/replayed packets (signed wraparound
			 * comparison on the 32-bit sequence space). */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		/* Ethernet-over-GRE: classify from the inner frame. */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			skb->protocol = ipgre_type_trans(skb, tunnel->dev);
			if (!skb->protocol) {
				tunnel->stat.rx_errors++;
				goto drop;
			}
		}
		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);
		ipgre_ecn_decapsulate(iph, skb);
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
/* hard_start_xmit for GRE devices: resolve the outer route, enforce
 * PMTU, prepend the IP+GRE headers and transmit via IPTUNNEL_XMIT().
 * tunnel->recursion breaks locally-looped encapsulation (see
 * "Problems & solutions" #1 at the top of this file).  Always returns
 * 0 and consumes the skb. */
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr  *old_iph = skb->nh.iph;
	struct iphdr  *tiph;		/* template for the outer IP header */
	u8     tos;
	u16    df;
	struct rtable *rt;		/* Route to the other host */
	struct net_device *tdev;	/* Device to other host */
	struct iphdr  *iph;		/* Our new IP header */
	int    max_headroom;		/* The extra header space needed */
	int    gre_hlen;		/* total outer IP+GRE length */
	int    push_hlen;		/* how much we still have to push here */
	u32    dst;
	int    mtu;

	if (tunnel->recursion++) {
		tunnel->stat.collisions++;
		goto tx_error;
	}

	if (dev->type == ARPHRD_ETHER) {
		/* Ethernet-over-GRE (vendor ext): inner Ethernet header is
		 * already on the skb; hlen includes ETH_HLEN, so subtract it. */
		skb->protocol = htons(ETH_P_BRIDGE);
		gre_hlen = tunnel->hlen - ETH_HLEN;
		push_hlen = gre_hlen;
		tiph = &tunnel->parms.iph;
	} else if (dev->hard_header) {
		/* Broadcast tunnel: ipgre_header() already prebuilt the
		 * outer header at skb->data — nothing to push. */
		gre_hlen = tunnel->hlen;
		push_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		push_hlen = gre_hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel: derive the outer destination from the route
		 * (IPv4 gateway) or neighbour (IPv4-compatible IPv6). */

		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = (struct rtable*)skb->dst;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &skb->nh.ipv6h->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* Odd configured TOS means "inherit from inner IPv4 packet". */
	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(&rt, &fl)) {
			tunnel->stat.tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Routing back onto ourselves would be a local dead loop. */
	if (tdev == dev) {
		ip_rt_put(rt);
		tunnel->stat.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		/* Inner DF set and the packet will not fit: report PMTU. */
		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU &&
		    mtu < skb->len - tunnel->hlen + push_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Relay recent soft errors recorded by ipgre_err(). */
	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + push_hlen;

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = skb->nh.iph;
	}

	skb->h.raw = skb->nh.raw;
	skb->nh.raw = skb_push(skb, push_hlen);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */

	iph = skb->nh.iph;
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;

	/* ttl==0 config means "inherit from the inner packet". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	/* Base GRE header: flags word then protocol word. */
	((u16*)(iph+1))[0] = tunnel->parms.o_flags;
	((u16*)(iph+1))[1] = skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		/* Optional words are filled back-to-front: seq, key, csum. */
		u32 *ptr = (u32*)(((u8*)iph) + gre_hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}
/* Tunnel configuration ioctls (SIOCGETTUNNEL / SIOCADDTUNNEL /
 * SIOCCHGTUNNEL / SIOCDELTUNNEL); parameters travel as a
 * struct ip_tunnel_parm in user memory via ifr->ifr_ifru.ifru_data.
 * add/chg/del require CAP_NET_ADMIN.  Called under RTNL. */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ipgre_fb_tunnel_dev) {
			/* Queries on gre0 look up an arbitrary tunnel by parms. */
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(&p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Validate the user-supplied outer-header template. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* Vendor ext: iph.id selects plain vs Ethernet-over-GRE mode. */
		if (p.iph.id != 0 && p.iph.id != htons(ETH_P_BRIDGE))
			goto done;
		/* Fixed TTL forces DF — see "Problems & solutions" #2. */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);

		if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = netdev_priv(dev);

				if (t->dev->type == ARPHRD_ETHER)
					nflags = IFF_BROADCAST;
				else if (MULTICAST(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Changing addresses must not flip the device class. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash under the new addresses/keys. */
				ipgre_tunnel_unlink(t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ipgre_fb_tunnel_dev) {
			/* Deleting "via" gre0 targets the tunnel named in parms;
			 * gre0 itself can never be deleted. */
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ipgre_fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		err = unregister_netdevice(dev);
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1066 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1068 return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
1071 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1073 struct ip_tunnel *tunnel = netdev_priv(dev);
1074 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
1075 return -EINVAL;
1076 dev->mtu = new_mtu;
1077 return 0;
1080 #ifdef CONFIG_NET_IPGRE_BROADCAST
1081 /* Nice toy. Unfortunately, useless in real life :-)
1082 It allows to construct virtual multiprotocol broadcast "LAN"
1083 over the Internet, provided multicast routing is tuned.
1086 I have no idea whether this bicycle was invented before me,
1087 so that I had to set ARPHRD_IPGRE to a random value.
1088 I have an impression, that Cisco could make something similar,
1089 but this feature is apparently missing in IOS<=11.2(8).
1091 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1092 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1094 ping -t 255 224.66.66.66
1096 If nobody answers, mbone does not work.
1098 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1099 ip addr add 10.66.66.<somewhat>/24 dev Universe
1100 ifconfig Universe up
1101 ifconfig Universe add fe80::<Your_real_addr>/10
1102 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1103 ftp 10.66.66.66
1105 ftp fec0:6666:6666::193.233.7.65
/* hard_header hook for broadcast (multicast-destination) GRE tunnels:
 * prebuild the outer IP header plus the base GRE flags/protocol words
 * in front of the payload.  Returns t->hlen when the destination is
 * resolved, or -t->hlen when daddr must still be filled in later
 * (standard hard_header convention). */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
			void *daddr, void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	u16 *p = (u16*)(iph+1);	/* GRE header directly after the IP header */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;
	p[1] = htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	/* Configured unicast destination already in the template. */
	if (iph->daddr && !MULTICAST(iph->daddr))
		return t->hlen;

	return -t->hlen;
}
/* netdev open hook (broadcast tunnels): join the multicast group on
 * the interface the route to the multicast destination points at, and
 * remember its ifindex in t->mlink for ipgre_close(). */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (MULTICAST(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(&rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;	/* deliberately shadows the tunnel dev */
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
/* netdev stop hook: leave the multicast group joined by ipgre_open(),
 * looking the underlying interface up again by the saved ifindex. */
static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev = inetdev_by_index(t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	return 0;
}
1175 #endif
/* netdev setup callback for plain (layer-3) GRE tunnel devices:
 * install the shared hooks and conservative defaults; sizes are
 * refined per-tunnel in ipgre_tunnel_init(). */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	SET_MODULE_OWNER(dev);
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	/* Worst-case outer overhead: link layer + IP + 4-byte base GRE. */
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* hardware address is the IPv4 saddr */
}
/* netdev setup callback for Ethernet-over-GRE devices (vendor
 * extension): start from ether_setup() defaults, then install the
 * shared tunnel hooks.  MTU and header sizes are fixed up later in
 * ipgre_tunnel_init(). */
static void ipgre_ether_tunnel_setup(struct net_device *dev)
{
	ether_setup(dev);

	SET_MODULE_OWNER(dev);
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
}
/* netdev init hook: finish configuring a tunnel once its parameters
 * are known — choose device flags (broadcast / point-to-point), guess
 * the underlying device to size mtu and hard_header_len, and
 * precompute the total encapsulation overhead in tunnel->hlen. */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	if (dev->type == ARPHRD_ETHER)
		random_ether_addr(dev->dev_addr);
	else {
		/* For layer-3 tunnels the "hardware" addresses are the
		 * outer IPv4 endpoints. */
		memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
		memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
	}

	if (dev->type == ARPHRD_ETHER)
		dev->flags |= IFF_BROADCAST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
	else if (MULTICAST(iph->daddr)) {
		/* Multicast destination: acts like a broadcast LAN and
		 * needs an explicit local address to join the group. */
		if (!iph->saddr)
			return -EINVAL;
		dev->flags = IFF_BROADCAST;
		dev->hard_header = ipgre_header;
		dev->open = ipgre_open;
		dev->stop = ipgre_close;
	}
#endif
	else if (iph->daddr)
		dev->flags |= IFF_POINTOPOINT;

	/* Guess output device to choose reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(&rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	if (dev->type == ARPHRD_ETHER)
		addend += ETH_HLEN;	/* inner Ethernet header also counts */
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
	return 0;
}
/* init hook for the fallback gre0 device: no addresses, no key.  It
 * catches keyless GRE traffic matched by no configured tunnel (see
 * ipgre_tunnel_lookup()). */
static int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	/* Head of the wildcard, keyless chain (key 0 hashes to 0). */
	tunnels_wc[0]		= tunnel;
	return 0;
}
/* IPPROTO_GRE handlers registered with the IPv4 stack. */
static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
};
1312 * And now the modules code and kernel interface.
/* Module init: register the GRE protocol handler, then create and
 * register the fallback gre0 device.  Either failure path unwinds
 * the protocol registration. */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ipgre_fb_tunnel_dev) {
		err = -ENOMEM;
		goto err1;
	}

	ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;

	if ((err = register_netdev(ipgre_fb_tunnel_dev)))
		goto err2;
out:
	return err;
err2:
	free_netdev(ipgre_fb_tunnel_dev);
err1:
	/* Undo protocol registration on any failure path. */
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}
/* Unregister every tunnel device in all four hash tables.  Caller
 * holds RTNL.  The chain head is re-read each iteration because
 * unregistering a device unlinks it via the uninit hook. */
static void __exit ipgre_destroy_tunnels(void)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);
		}
	}
}
/* Module exit: stop receiving GRE first, then tear down all tunnel
 * devices under RTNL. */
static void __exit ipgre_fini(void)
{
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	rtnl_lock();
	ipgre_destroy_tunnels();
	rtnl_unlock();
}
1370 module_init(ipgre_init);
1371 module_exit(ipgre_fini);
1372 MODULE_LICENSE("GPL");