Ok. I didn't make 2.4.0 in 2000. Tough. I tried, but we had some
[davej-history.git] / net / ipv4 / ip_gre.c
blob0c924a79392c25c45905c8142927d8bc2fa8ac9f
1 /*
2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
32 #include <net/sock.h>
33 #include <net/ip.h>
34 #include <net/icmp.h>
35 #include <net/protocol.h>
36 #include <net/ipip.h>
37 #include <net/arp.h>
38 #include <net/checksum.h>
39 #include <net/inet_ecn.h>
41 #ifdef CONFIG_IPV6
42 #include <net/ipv6.h>
43 #include <net/ip6_fib.h>
44 #include <net/ip6_route.h>
45 #endif
48 Problems & solutions
49 --------------------
51 1. The most important issue is detecting local dead loops.
52 They would cause complete host lockup in transmit, which
53 would be "resolved" by stack overflow or, if queueing is enabled,
54 with infinite looping in net_bh.
56 We cannot track such dead loops during route installation,
57 it is infeasible task. The most general solutions would be
58 to keep skb->encapsulation counter (sort of local ttl),
59 and silently drop packet when it expires. It is the best
60 solution, but it supposes maintaining a new variable in ALL
61 skb, even if no tunneling is used.
63 Current solution: t->recursion lock breaks dead loops. It looks
64 like dev->tbusy flag, but I preferred new variable, because
65 the semantics is different. One day, when hard_start_xmit
66 will be multithreaded we will have to use skb->encapsulation.
70 2. Networking dead loops would not kill routers, but would really
71 kill network. IP hop limit plays role of "t->recursion" in this case,
72 if we copy it from packet being encapsulated to upper header.
73 It is very good solution, but it introduces two problems:
75 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
76 do not work over tunnels.
77 - traceroute does not work. I planned to relay ICMP from tunnel,
78 so that this problem would be solved and traceroute output
79 would even more informative. This idea appeared to be wrong:
80 only Linux complies to rfc1812 now (yes, guys, Linux is the only
81 true router now :-)), all routers (at least, in neighbourhood of mine)
82 return only 8 bytes of payload. It is the end.
84 Hence, if we want that OSPF worked or traceroute said something reasonable,
85 we should search for another solution.
87 One of them is to parse packet trying to detect inner encapsulation
88 made by our node. It is difficult or even impossible, especially,
89 taking into account fragmentation. To be short, it is not a solution at all.
91 Current solution: The solution was UNEXPECTEDLY SIMPLE.
92 We force DF flag on tunnels with preconfigured hop limit,
93 that is ALL. :-) Well, it does not remove the problem completely,
94 but exponential growth of network traffic is changed to linear
95 (branches, that exceed pmtu are pruned) and tunnel mtu
96 quickly degrades to a value <68, where looping stops.
97 Yes, it is not good if there exists a router in the loop,
98 which does not force DF, even when encapsulating packets have DF set.
99 But it is not our problem! Nobody could accuse us, we made
100 all that we could make. Even if it is your gated who injected
101 fatal route to network, even if it were you who configured
102 fatal static route: you are innocent. :-)
106 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
107 practically identical code. It would be good to glue them
108 together, but it is not very evident, how to make them modular.
109 sit is integral part of IPv6, ipip and gre are naturally modular.
110 We could extract common parts (hash table, ioctl etc)
111 to a separate module (ip_tunnel.c).
113 Alexey Kuznetsov.
116 static int ipgre_tunnel_init(struct net_device *dev);
118 /* Fallback tunnel: no source, no destination, no key, no options */
120 static int ipgre_fb_tunnel_init(struct net_device *dev);
122 static struct net_device ipgre_fb_tunnel_dev = {
123 "gre0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
126 static struct ip_tunnel ipgre_fb_tunnel = {
127 NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
130 /* Tunnel hash table */
133 4 hash tables:
135 3: (remote,local)
136 2: (remote,*)
137 1: (*,local)
138 0: (*,*)
140 We require exact key match i.e. if a key is present in packet
141 it will match only tunnel with the same key; if it is not present,
142 it will match only keyless tunnel.
144 All keyless packets, if not matched to configured keyless tunnels,
145 will match fallback tunnel.
148 #define HASH_SIZE 16
149 #define HASH(addr) ((addr^(addr>>4))&0xF)
151 static struct ip_tunnel *tunnels[4][HASH_SIZE];
153 #define tunnels_r_l (tunnels[3])
154 #define tunnels_r (tunnels[2])
155 #define tunnels_l (tunnels[1])
156 #define tunnels_wc (tunnels[0])
158 static rwlock_t ipgre_lock = RW_LOCK_UNLOCKED;
160 /* Given src, dst and key, find the appropriate input tunnel. */
162 static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
164 unsigned h0 = HASH(remote);
165 unsigned h1 = HASH(key);
166 struct ip_tunnel *t;
168 for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
169 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
170 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
171 return t;
174 for (t = tunnels_r[h0^h1]; t; t = t->next) {
175 if (remote == t->parms.iph.daddr) {
176 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
177 return t;
180 for (t = tunnels_l[h1]; t; t = t->next) {
181 if (local == t->parms.iph.saddr ||
182 (local == t->parms.iph.daddr && MULTICAST(local))) {
183 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
184 return t;
187 for (t = tunnels_wc[h1]; t; t = t->next) {
188 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
189 return t;
191 if (ipgre_fb_tunnel_dev.flags&IFF_UP)
192 return &ipgre_fb_tunnel;
193 return NULL;
196 static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
198 u32 remote = t->parms.iph.daddr;
199 u32 local = t->parms.iph.saddr;
200 u32 key = t->parms.i_key;
201 unsigned h = HASH(key);
202 int prio = 0;
204 if (local)
205 prio |= 1;
206 if (remote && !MULTICAST(remote)) {
207 prio |= 2;
208 h ^= HASH(remote);
211 return &tunnels[prio][h];
214 static void ipgre_tunnel_link(struct ip_tunnel *t)
216 struct ip_tunnel **tp = ipgre_bucket(t);
218 t->next = *tp;
219 write_lock_bh(&ipgre_lock);
220 *tp = t;
221 write_unlock_bh(&ipgre_lock);
224 static void ipgre_tunnel_unlink(struct ip_tunnel *t)
226 struct ip_tunnel **tp;
228 for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
229 if (t == *tp) {
230 write_lock_bh(&ipgre_lock);
231 *tp = t->next;
232 write_unlock_bh(&ipgre_lock);
233 break;
/* Find a tunnel matching @parms exactly (local, remote and key).
   If no match exists and @create is non-zero, allocate and register
   a new tunnel device.  The net_device and its ip_tunnel private
   area come from one kmalloc: priv points just past the dev struct.
   Returns the tunnel, or NULL on miss / allocation / registration
   failure.
   NOTE(review): assumes the caller serializes configuration (rtnl)
   since the chain walk here is lockless - confirm at call sites. */
238 static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
240 u32 remote = parms->iph.daddr;
241 u32 local = parms->iph.saddr;
242 u32 key = parms->i_key;
243 struct ip_tunnel *t, **tp, *nt;
244 struct net_device *dev;
245 unsigned h = HASH(key);
246 int prio = 0;
248 if (local)
249 prio |= 1;
250 if (remote && !MULTICAST(remote)) {
251 prio |= 2;
252 h ^= HASH(remote);
/* Walk only the one bucket this (local,remote,key) triple can hash to. */
254 for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
255 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
256 if (key == t->parms.i_key)
257 return t;
260 if (!create)
261 return NULL;
/* Each dynamically created tunnel pins the module until its
   destructor (ipgre_tunnel_destructor) runs. */
263 MOD_INC_USE_COUNT;
264 dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL);
265 if (dev == NULL) {
266 MOD_DEC_USE_COUNT;
267 return NULL;
269 memset(dev, 0, sizeof(*dev) + sizeof(*t));
270 dev->priv = (void*)(dev+1);
271 nt = (struct ip_tunnel*)dev->priv;
272 nt->dev = dev;
273 dev->init = ipgre_tunnel_init;
274 dev->features |= NETIF_F_DYNALLOC;
275 memcpy(&nt->parms, parms, sizeof(*parms));
276 strcpy(dev->name, nt->parms.name);
/* Empty name from userspace: pick the first free "gre%d" in 1..99
   and report the chosen name back through @parms. */
277 if (dev->name[0] == 0) {
278 int i;
279 for (i=1; i<100; i++) {
280 sprintf(dev->name, "gre%d", i);
281 if (__dev_get_by_name(dev->name) == NULL)
282 break;
284 if (i==100)
285 goto failed;
286 memcpy(parms->name, dev->name, IFNAMSIZ);
288 if (register_netdevice(dev) < 0)
289 goto failed;
/* Hold a device reference for the hash table; dropped in
   ipgre_tunnel_uninit when the device is unregistered. */
291 dev_hold(dev);
292 ipgre_tunnel_link(nt);
293 /* Do not decrement MOD_USE_COUNT here. */
294 return nt;
296 failed:
297 kfree(dev);
298 MOD_DEC_USE_COUNT;
299 return NULL;
302 static void ipgre_tunnel_destructor(struct net_device *dev)
304 if (dev != &ipgre_fb_tunnel_dev) {
305 MOD_DEC_USE_COUNT;
309 static void ipgre_tunnel_uninit(struct net_device *dev)
311 ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
312 dev_put(dev);
/* ICMP error handler for IPPROTO_GRE.  @dp points at the outer IP
   header of our returned (offending) packet, @len is how much of it
   the remote router sent back.  The compiled-in branch only records
   per-tunnel error state (err_count/err_time), which
   ipgre_tunnel_xmit later relays to senders as link failures; the
   I_WISH_WORLD_WERE_PERFECT branch is the full ICMP-relay variant,
   unused because real routers return only 8 payload bytes. */
316 void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len)
318 #ifndef I_WISH_WORLD_WERE_PERFECT
320 /* It is not :-( All the routers (except for Linux) return only
321 8 bytes of packet payload. It means, that precise relaying of
322 ICMP in the real Internet is absolutely infeasible.
324 Moreover, Cisco "wise men" put GRE key to the third word
325 in GRE header. It makes impossible maintaining even soft state for keyed
326 GRE tunnels with enabled checksum. Tell them "thank you".
328 Well, I wonder, rfc1812 was written by Cisco employee,
329 what the hell these idiots break standrads established
330 by themself???
333 struct iphdr *iph = (struct iphdr*)dp;
334 u16 *p = (u16*)(dp+(iph->ihl<<2));
335 int grehlen = (iph->ihl<<2) + 4;
336 int type = skb->h.icmph->type;
337 int code = skb->h.icmph->code;
338 struct ip_tunnel *t;
339 u16 flags;
/* Parse the GRE flags word to find how long the GRE header is and
   hence where the key (if any) sits. */
341 flags = p[0];
342 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
343 if (flags&(GRE_VERSION|GRE_ROUTING))
344 return;
345 if (flags&GRE_KEY) {
346 grehlen += 4;
347 if (flags&GRE_CSUM)
348 grehlen += 4;
352 /* If only 8 bytes returned, keyed message will be dropped here */
353 if (len < grehlen)
354 return;
356 switch (type) {
357 default:
358 case ICMP_PARAMETERPROB:
359 return;
361 case ICMP_DEST_UNREACH:
362 switch (code) {
363 case ICMP_SR_FAILED:
364 case ICMP_PORT_UNREACH:
365 /* Impossible event. */
366 return;
367 case ICMP_FRAG_NEEDED:
368 /* Soft state for pmtu is maintained by IP core. */
369 return;
370 default:
371 /* All others are translated to HOST_UNREACH.
372 rfc2003 contains "deep thoughts" about NET_UNREACH,
373 I believe they are just ether pollution. --ANK
375 break;
377 break;
378 case ICMP_TIME_EXCEEDED:
379 if (code != ICMP_EXC_TTL)
380 return;
381 break;
/* Look the tunnel up by the OUTER addresses: daddr/saddr of the
   returned packet are OUR send direction, so they map to the
   tunnel's (remote=daddr, local=saddr). */
384 read_lock(&ipgre_lock);
385 t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
386 if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
387 goto out;
/* TTL-inheriting tunnels legitimately produce TIME_EXCEEDED
   (e.g. traceroute through the tunnel); do not count those. */
389 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
390 goto out;
/* Errors within IPTUNNEL_ERR_TIMEO of each other accumulate;
   otherwise restart the count. */
392 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
393 t->err_count++;
394 else
395 t->err_count = 1;
396 t->err_time = jiffies;
397 out:
398 read_unlock(&ipgre_lock);
399 return;
400 #else
401 struct iphdr *iph = (struct iphdr*)dp;
402 struct iphdr *eiph;
403 u16 *p = (u16*)(dp+(iph->ihl<<2));
404 int type = skb->h.icmph->type;
405 int code = skb->h.icmph->code;
406 int rel_type = 0;
407 int rel_code = 0;
408 int rel_info = 0;
409 u16 flags;
410 int grehlen = (iph->ihl<<2) + 4;
411 struct sk_buff *skb2;
412 struct rtable *rt;
/* Only IPv4-in-GRE can be relayed here. */
414 if (p[1] != __constant_htons(ETH_P_IP))
415 return;
417 flags = p[0];
418 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
419 if (flags&(GRE_VERSION|GRE_ROUTING))
420 return;
421 if (flags&GRE_CSUM)
422 grehlen += 4;
423 if (flags&GRE_KEY)
424 grehlen += 4;
425 if (flags&GRE_SEQ)
426 grehlen += 4;
/* Need the full inner IP header to relay anything meaningful. */
428 if (len < grehlen + sizeof(struct iphdr))
429 return;
430 eiph = (struct iphdr*)(dp + grehlen);
432 switch (type) {
433 default:
434 return;
435 case ICMP_PARAMETERPROB:
436 if (skb->h.icmph->un.gateway < (iph->ihl<<2))
437 return;
439 /* So... This guy found something strange INSIDE encapsulated
440 packet. Well, he is fool, but what can we do ?
442 rel_type = ICMP_PARAMETERPROB;
443 rel_info = skb->h.icmph->un.gateway - grehlen;
444 break;
446 case ICMP_DEST_UNREACH:
447 switch (code) {
448 case ICMP_SR_FAILED:
449 case ICMP_PORT_UNREACH:
450 /* Impossible event. */
451 return;
452 case ICMP_FRAG_NEEDED:
453 /* And it is the only really necesary thing :-) */
454 rel_info = ntohs(skb->h.icmph->un.frag.mtu);
455 if (rel_info < grehlen+68)
456 return;
457 rel_info -= grehlen;
458 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
459 if (rel_info > ntohs(eiph->tot_len))
460 return;
461 break;
462 default:
463 /* All others are translated to HOST_UNREACH.
464 rfc2003 contains "deep thoughts" about NET_UNREACH,
465 I believe, it is just ether pollution. --ANK
467 rel_type = ICMP_DEST_UNREACH;
468 rel_code = ICMP_HOST_UNREACH;
469 break;
471 break;
472 case ICMP_TIME_EXCEEDED:
473 if (code != ICMP_EXC_TTL)
474 return;
475 break;
478 /* Prepare fake skb to feed it to icmp_send */
479 skb2 = skb_clone(skb, GFP_ATOMIC);
480 if (skb2 == NULL)
481 return;
482 dst_release(skb2->dst);
483 skb2->dst = NULL;
/* Re-point the clone at the INNER packet before routing it. */
484 skb_pull(skb2, skb->data - (u8*)eiph);
485 skb2->nh.raw = skb2->data;
487 /* Try to guess incoming interface */
488 if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
489 kfree_skb(skb2);
490 return;
492 skb2->dev = rt->u.dst.dev;
494 /* route "incoming" packet */
495 if (rt->rt_flags&RTCF_LOCAL) {
496 ip_rt_put(rt);
497 rt = NULL;
498 if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
499 rt->u.dst.dev->type != ARPHRD_IPGRE) {
500 ip_rt_put(rt);
501 kfree_skb(skb2);
502 return;
504 } else {
505 ip_rt_put(rt);
506 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
507 skb2->dst->dev->type != ARPHRD_IPGRE) {
508 kfree_skb(skb2);
509 return;
513 /* change mtu on this route */
514 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
515 if (rel_info > skb2->dst->pmtu) {
516 kfree_skb(skb2);
517 return;
519 skb2->dst->pmtu = rel_info;
520 rel_info = htonl(rel_info);
521 } else if (type == ICMP_TIME_EXCEEDED) {
522 struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
523 if (t->parms.iph.ttl) {
524 rel_type = ICMP_DEST_UNREACH;
525 rel_code = ICMP_HOST_UNREACH;
529 icmp_send(skb2, rel_type, rel_code, rel_info);
530 kfree_skb(skb2);
531 #endif
534 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
536 if (INET_ECN_is_ce(iph->tos)) {
537 if (skb->protocol == __constant_htons(ETH_P_IP)) {
538 if (INET_ECN_is_not_ce(skb->nh.iph->tos))
539 IP_ECN_set_ce(skb->nh.iph);
540 } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
541 if (INET_ECN_is_not_ce(ip6_get_dsfield(skb->nh.ipv6h)))
542 IP6_ECN_set_ce(skb->nh.ipv6h);
547 static inline u8
548 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
550 #ifdef CONFIG_INET_ECN
551 u8 inner = 0;
552 if (skb->protocol == __constant_htons(ETH_P_IP))
553 inner = old_iph->tos;
554 else if (skb->protocol == __constant_htons(ETH_P_IPV6))
555 inner = ip6_get_dsfield((struct ipv6hdr*)old_iph);
556 return INET_ECN_encapsulate(tos, inner);
557 #else
558 return tos;
559 #endif
/* GRE receive handler (inet_protocol hook for IPPROTO_GRE).
   Parses the GRE flags word and optional csum/key/seq fields, looks
   the tunnel up by outer addresses + key, enforces the tunnel's
   checksum/sequence policy, then strips the headers and re-injects
   the inner packet via netif_rx() as if it arrived on the tunnel
   device.  Packets matching no tunnel draw an ICMP
   protocol-unreachable.  Always consumes @skb; returns 0. */
562 int ipgre_rcv(struct sk_buff *skb, unsigned short len)
564 struct iphdr *iph = skb->nh.iph;
565 u8 *h = skb->h.raw;
566 u16 flags = *(u16*)h;
567 u16 csum = 0;
568 u32 key = 0;
569 u32 seqno = 0;
570 struct ip_tunnel *tunnel;
571 int offset = 4;
573 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
574 /* - Version must be 0.
575 - We do not support routing headers.
577 if (flags&(GRE_VERSION|GRE_ROUTING))
578 goto drop_nolock;
/* A correct GRE checksum over the whole payload folds to 0, so a
   non-zero csum here means verification failed (checked below). */
580 if (flags&GRE_CSUM) {
581 csum = ip_compute_csum(h, len);
582 offset += 4;
584 if (flags&GRE_KEY) {
585 key = *(u32*)(h + offset);
586 offset += 4;
588 if (flags&GRE_SEQ) {
589 seqno = ntohl(*(u32*)(h + offset));
590 offset += 4;
/* Hold ipgre_lock while the tunnel pointer is in use. */
594 read_lock(&ipgre_lock);
595 if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
596 skb->mac.raw = skb->nh.raw;
597 skb->nh.raw = skb_pull(skb, h + offset - skb->data);
598 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
599 skb->ip_summed = 0;
600 skb->protocol = *(u16*)(h + 2);
601 skb->pkt_type = PACKET_HOST;
602 #ifdef CONFIG_NET_IPGRE_BROADCAST
603 if (MULTICAST(iph->daddr)) {
604 /* Looped back packet, drop it! */
605 if (((struct rtable*)skb->dst)->key.iif == 0)
606 goto drop;
607 tunnel->stat.multicast++;
608 skb->pkt_type = PACKET_BROADCAST;
610 #endif
/* Checksum policy: a present-but-bad checksum, or a tunnel that
   requires checksums on a packet without one, is an rx error. */
612 if (((flags&GRE_CSUM) && csum) ||
613 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
614 tunnel->stat.rx_crc_errors++;
615 tunnel->stat.rx_errors++;
616 goto drop;
/* Sequence policy: require a seq field and drop reordered
   (older-than-expected) packets, using wrap-safe s32 compare. */
618 if (tunnel->parms.i_flags&GRE_SEQ) {
619 if (!(flags&GRE_SEQ) ||
620 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
621 tunnel->stat.rx_fifo_errors++;
622 tunnel->stat.rx_errors++;
623 goto drop;
625 tunnel->i_seqno = seqno + 1;
627 tunnel->stat.rx_packets++;
628 tunnel->stat.rx_bytes += skb->len;
629 skb->dev = tunnel->dev;
630 dst_release(skb->dst);
631 skb->dst = NULL;
632 #ifdef CONFIG_NETFILTER
/* Inner packet must be re-tracked from scratch. */
633 nf_conntrack_put(skb->nfct);
634 skb->nfct = NULL;
635 #ifdef CONFIG_NETFILTER_DEBUG
636 skb->nf_debug = 0;
637 #endif
638 #endif
639 ipgre_ecn_decapsulate(iph, skb);
640 netif_rx(skb);
641 read_unlock(&ipgre_lock);
642 return(0);
/* No tunnel matched: tell the sender GRE is not spoken here. */
644 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
646 drop:
647 read_unlock(&ipgre_lock);
648 drop_nolock:
649 kfree_skb(skb);
650 return(0);
/* NF_HOOK needs a real function address to call, and ip_send may
 * be inline, so provide this trivial addressable wrapper. */
static inline int do_ip_send(struct sk_buff *skb)
{
	return ip_send(skb);
}
/* hard_start_xmit for GRE devices: resolve the NBMA destination if
   needed, route the outer packet, enforce path-MTU/DF policy, make
   headroom, then prepend the outer IP header plus GRE header and
   hand off via IPTUNNEL_XMIT().  tunnel->recursion breaks local
   dead loops (see the discussion at the top of this file), and
   err_count relays recently received ICMP errors back to senders
   as link failures.  Always consumes @skb; returns 0. */
659 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
661 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
662 struct net_device_stats *stats = &tunnel->stat;
663 struct iphdr *old_iph = skb->nh.iph;
664 struct iphdr *tiph;
665 u8 tos;
666 u16 df;
667 struct rtable *rt; /* Route to the other host */
668 struct net_device *tdev; /* Device to other host */
669 struct iphdr *iph; /* Our new IP header */
670 int max_headroom; /* The extra header space needed */
671 int gre_hlen;
672 u32 dst;
673 int mtu;
675 if (tunnel->recursion++) {
676 tunnel->stat.collisions++;
677 goto tx_error;
/* With a hard_header (broadcast mode), ipgre_header() already
   built the outer headers at skb->data; otherwise use the
   tunnel's configured template. */
680 if (dev->hard_header) {
681 gre_hlen = 0;
682 tiph = (struct iphdr*)skb->data;
683 } else {
684 gre_hlen = tunnel->hlen;
685 tiph = &tunnel->parms.iph;
688 if ((dst = tiph->daddr) == 0) {
689 /* NBMA tunnel */
691 if (skb->dst == NULL) {
692 tunnel->stat.tx_fifo_errors++;
693 goto tx_error;
/* Derive the outer destination from the inner route: the IPv4
   gateway, or the v4-compatible part of the IPv6 neighbour. */
696 if (skb->protocol == __constant_htons(ETH_P_IP)) {
697 rt = (struct rtable*)skb->dst;
698 if ((dst = rt->rt_gateway) == 0)
699 goto tx_error_icmp;
701 #ifdef CONFIG_IPV6
702 else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
703 struct in6_addr *addr6;
704 int addr_type;
705 struct neighbour *neigh = skb->dst->neighbour;
707 if (neigh == NULL)
708 goto tx_error;
710 addr6 = (struct in6_addr*)&neigh->primary_key;
711 addr_type = ipv6_addr_type(addr6);
713 if (addr_type == IPV6_ADDR_ANY) {
714 addr6 = &skb->nh.ipv6h->daddr;
715 addr_type = ipv6_addr_type(addr6);
718 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
719 goto tx_error_icmp;
721 dst = addr6->s6_addr32[3];
723 #endif
724 else
725 goto tx_error;
/* Low bit of the configured TOS means "inherit from inner IPv4
   header"; it is never sent on the wire. */
728 tos = tiph->tos;
729 if (tos&1) {
730 if (skb->protocol == __constant_htons(ETH_P_IP))
731 tos = old_iph->tos;
732 tos &= ~1;
735 if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
736 tunnel->stat.tx_carrier_errors++;
737 goto tx_error;
739 tdev = rt->u.dst.dev;
/* Routing back to ourselves would be a dead loop. */
741 if (tdev == dev) {
742 ip_rt_put(rt);
743 tunnel->stat.collisions++;
744 goto tx_error;
747 df = tiph->frag_off;
748 mtu = rt->u.dst.pmtu - tunnel->hlen;
/* Propagate the reduced tunnel MTU to the inner route, and honour
   the inner DF bit: too-big DF packets bounce with FRAG_NEEDED. */
750 if (skb->protocol == __constant_htons(ETH_P_IP)) {
751 if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68)
752 skb->dst->pmtu = mtu;
754 df |= (old_iph->frag_off&__constant_htons(IP_DF));
756 if ((old_iph->frag_off&__constant_htons(IP_DF)) &&
757 mtu < ntohs(old_iph->tot_len)) {
758 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
759 ip_rt_put(rt);
760 goto tx_error;
763 #ifdef CONFIG_IPV6
764 else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
765 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
767 if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
768 if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
769 rt6->rt6i_dst.plen == 128) {
770 rt6->rt6i_flags |= RTF_MODIFIED;
771 skb->dst->pmtu = mtu;
775 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
776 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
777 ip_rt_put(rt);
778 goto tx_error;
781 #endif
/* Relay recently received ICMP errors (recorded by ipgre_err)
   back to local senders as link failures. */
783 if (tunnel->err_count > 0) {
784 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
785 tunnel->err_count--;
787 dst_link_failure(skb);
788 } else
789 tunnel->err_count = 0;
792 skb->h.raw = skb->nh.raw;
794 max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen;
/* Need a private, writable skb with room for the new headers. */
796 if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
797 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
798 if (!new_skb) {
799 ip_rt_put(rt);
800 stats->tx_dropped++;
801 dev_kfree_skb(skb);
802 tunnel->recursion--;
803 return 0;
805 if (skb->sk)
806 skb_set_owner_w(new_skb, skb->sk);
807 dev_kfree_skb(skb);
808 skb = new_skb;
811 skb->nh.raw = skb_push(skb, gre_hlen);
812 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
813 dst_release(skb->dst);
814 skb->dst = &rt->u.dst;
817 * Push down and install the IPIP header.
820 iph = skb->nh.iph;
821 iph->version = 4;
822 iph->ihl = sizeof(struct iphdr) >> 2;
823 iph->frag_off = df;
824 iph->protocol = IPPROTO_GRE;
825 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
826 iph->daddr = rt->rt_dst;
827 iph->saddr = rt->rt_src;
/* Configured TTL 0 means "copy from the inner packet". */
829 if ((iph->ttl = tiph->ttl) == 0) {
830 if (skb->protocol == __constant_htons(ETH_P_IP))
831 iph->ttl = old_iph->ttl;
832 #ifdef CONFIG_IPV6
833 else if (skb->protocol == __constant_htons(ETH_P_IPV6))
834 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
835 #endif
836 else
837 iph->ttl = sysctl_ip_default_ttl;
840 ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
841 ((u16*)(iph+1))[1] = skb->protocol;
/* Optional GRE fields are filled back-to-front from the end of
   the precomputed header (seq, then key, then checksum). */
843 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
844 u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
846 if (tunnel->parms.o_flags&GRE_SEQ) {
847 ++tunnel->o_seqno;
848 *ptr = htonl(tunnel->o_seqno);
849 ptr--;
851 if (tunnel->parms.o_flags&GRE_KEY) {
852 *ptr = tunnel->parms.o_key;
853 ptr--;
855 if (tunnel->parms.o_flags&GRE_CSUM) {
856 *ptr = 0;
857 *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
861 #ifdef CONFIG_NETFILTER
862 nf_conntrack_put(skb->nfct);
863 skb->nfct = NULL;
864 #ifdef CONFIG_NETFILTER_DEBUG
865 skb->nf_debug = 0;
866 #endif
867 #endif
869 IPTUNNEL_XMIT();
870 tunnel->recursion--;
871 return 0;
873 tx_error_icmp:
874 dst_link_failure(skb);
876 tx_error:
877 stats->tx_errors++;
878 dev_kfree_skb(skb);
879 tunnel->recursion--;
880 return 0;
/* ioctl handler for GRE devices: SIOCGETTUNNEL / SIOCADDTUNNEL /
   SIOCCHGTUNNEL / SIOCDELTUNNEL, each passing a struct
   ip_tunnel_parm via ifr->ifr_ifru.ifru_data.  ADD/CHG/DEL require
   CAP_NET_ADMIN.  On the fallback device the parm block selects the
   target tunnel; on a real tunnel device the device itself is the
   target.  Returns 0 or a negative errno. */
883 static int
884 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
886 int err = 0;
887 struct ip_tunnel_parm p;
888 struct ip_tunnel *t;
890 MOD_INC_USE_COUNT;
892 switch (cmd) {
893 case SIOCGETTUNNEL:
894 t = NULL;
895 if (dev == &ipgre_fb_tunnel_dev) {
896 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
897 err = -EFAULT;
898 break;
900 t = ipgre_tunnel_locate(&p, 0);
902 if (t == NULL)
903 t = (struct ip_tunnel*)dev->priv;
904 memcpy(&p, &t->parms, sizeof(p));
905 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
906 err = -EFAULT;
907 break;
909 case SIOCADDTUNNEL:
910 case SIOCCHGTUNNEL:
911 err = -EPERM;
912 if (!capable(CAP_NET_ADMIN))
913 goto done;
915 err = -EFAULT;
916 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
917 goto done;
/* Validate the outer header template and forbid GRE flag bits we
   do not implement (routing, nonzero version). */
919 err = -EINVAL;
920 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
921 p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF)) ||
922 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
923 goto done;
/* A fixed TTL forces DF - this is the anti-loop measure described
   at the top of the file. */
924 if (p.iph.ttl)
925 p.iph.frag_off |= __constant_htons(IP_DF);
927 if (!(p.i_flags&GRE_KEY))
928 p.i_key = 0;
929 if (!(p.o_flags&GRE_KEY))
930 p.o_key = 0;
932 t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
/* Re-keying/re-addressing an existing device: unlink, update the
   fields that affect hashing, then relink in the right bucket. */
934 if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
935 t != &ipgre_fb_tunnel) {
936 if (t != NULL) {
937 if (t->dev != dev) {
938 err = -EEXIST;
939 break;
941 } else {
942 unsigned nflags=0;
944 t = (struct ip_tunnel*)dev->priv;
946 if (MULTICAST(p.iph.daddr))
947 nflags = IFF_BROADCAST;
948 else if (p.iph.daddr)
949 nflags = IFF_POINTOPOINT;
/* Cannot flip a device between broadcast and point-to-point
   mode after creation. */
951 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
952 err = -EINVAL;
953 break;
955 ipgre_tunnel_unlink(t);
956 t->parms.iph.saddr = p.iph.saddr;
957 t->parms.iph.daddr = p.iph.daddr;
958 t->parms.i_key = p.i_key;
959 t->parms.o_key = p.o_key;
960 memcpy(dev->dev_addr, &p.iph.saddr, 4);
961 memcpy(dev->broadcast, &p.iph.daddr, 4);
962 ipgre_tunnel_link(t);
963 netdev_state_change(dev);
967 if (t) {
968 err = 0;
969 if (cmd == SIOCCHGTUNNEL) {
970 t->parms.iph.ttl = p.iph.ttl;
971 t->parms.iph.tos = p.iph.tos;
972 t->parms.iph.frag_off = p.iph.frag_off;
974 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
975 err = -EFAULT;
976 } else
977 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
978 break;
980 case SIOCDELTUNNEL:
981 err = -EPERM;
982 if (!capable(CAP_NET_ADMIN))
983 goto done;
/* Deleting via the fallback device: look up the named tunnel, and
   never allow the fallback itself to be deleted. */
985 if (dev == &ipgre_fb_tunnel_dev) {
986 err = -EFAULT;
987 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
988 goto done;
989 err = -ENOENT;
990 if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
991 goto done;
992 err = -EPERM;
993 if (t == &ipgre_fb_tunnel)
994 goto done;
996 err = unregister_netdevice(dev);
997 break;
999 default:
1000 err = -EINVAL;
1003 done:
1004 MOD_DEC_USE_COUNT;
1005 return err;
1008 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1010 return &(((struct ip_tunnel*)dev->priv)->stat);
1013 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1015 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1016 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
1017 return -EINVAL;
1018 dev->mtu = new_mtu;
1019 return 0;
1022 #ifdef CONFIG_NET_IPGRE_BROADCAST
1023 /* Nice toy. Unfortunately, useless in real life :-)
1024 It allows to construct virtual multiprotocol broadcast "LAN"
1025 over the Internet, provided multicast routing is tuned.
1028 I have no idea was this bicycle invented before me,
1029 so that I had to set ARPHRD_IPGRE to a random value.
1030 I have an impression, that Cisco could make something similar,
1031 but this feature is apparently missing in IOS<=11.2(8).
1033 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1034 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1036 ping -t 255 224.66.66.66
1038 If nobody answers, mbone does not work.
1040 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1041 ip addr add 10.66.66.<somewhat>/24 dev Universe
1042 ifconfig Universe up
1043 ifconfig Universe add fe80::<Your_real_addr>/10
1044 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1045 ftp 10.66.66.66
1047 ftp fec0:6666:6666::193.233.7.65
1052 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1053 void *daddr, void *saddr, unsigned len)
1055 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1056 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1057 u16 *p = (u16*)(iph+1);
1059 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1060 p[0] = t->parms.o_flags;
1061 p[1] = htons(type);
1064 * Set the source hardware address.
1067 if (saddr)
1068 memcpy(&iph->saddr, saddr, 4);
1070 if (daddr) {
1071 memcpy(&iph->daddr, daddr, 4);
1072 return t->hlen;
1074 if (iph->daddr && !MULTICAST(iph->daddr))
1075 return t->hlen;
1077 return -t->hlen;
1080 static int ipgre_open(struct net_device *dev)
1082 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1084 MOD_INC_USE_COUNT;
1085 if (MULTICAST(t->parms.iph.daddr)) {
1086 struct rtable *rt;
1087 if (ip_route_output(&rt, t->parms.iph.daddr,
1088 t->parms.iph.saddr, RT_TOS(t->parms.iph.tos),
1089 t->parms.link)) {
1090 MOD_DEC_USE_COUNT;
1091 return -EADDRNOTAVAIL;
1093 dev = rt->u.dst.dev;
1094 ip_rt_put(rt);
1095 if (__in_dev_get(dev) == NULL) {
1096 MOD_DEC_USE_COUNT;
1097 return -EADDRNOTAVAIL;
1099 t->mlink = dev->ifindex;
1100 ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
1102 return 0;
1105 static int ipgre_close(struct net_device *dev)
1107 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1108 if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
1109 struct in_device *in_dev = inetdev_by_index(t->mlink);
1110 if (in_dev) {
1111 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1112 in_dev_put(in_dev);
1115 MOD_DEC_USE_COUNT;
1116 return 0;
1119 #endif
1121 static void ipgre_tunnel_init_gen(struct net_device *dev)
1123 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1125 dev->uninit = ipgre_tunnel_uninit;
1126 dev->destructor = ipgre_tunnel_destructor;
1127 dev->hard_start_xmit = ipgre_tunnel_xmit;
1128 dev->get_stats = ipgre_tunnel_get_stats;
1129 dev->do_ioctl = ipgre_tunnel_ioctl;
1130 dev->change_mtu = ipgre_tunnel_change_mtu;
1132 dev_init_buffers(dev);
1134 dev->type = ARPHRD_IPGRE;
1135 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1136 dev->mtu = 1500 - sizeof(struct iphdr) - 4;
1137 dev->flags = IFF_NOARP;
1138 dev->iflink = 0;
1139 dev->addr_len = 4;
1140 memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
1141 memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
1144 static int ipgre_tunnel_init(struct net_device *dev)
1146 struct net_device *tdev = NULL;
1147 struct ip_tunnel *tunnel;
1148 struct iphdr *iph;
1149 int hlen = LL_MAX_HEADER;
1150 int mtu = 1500;
1151 int addend = sizeof(struct iphdr) + 4;
1153 tunnel = (struct ip_tunnel*)dev->priv;
1154 iph = &tunnel->parms.iph;
1156 ipgre_tunnel_init_gen(dev);
1158 /* Guess output device to choose reasonable mtu and hard_header_len */
1160 if (iph->daddr) {
1161 struct rtable *rt;
1162 if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
1163 tdev = rt->u.dst.dev;
1164 ip_rt_put(rt);
1167 dev->flags |= IFF_POINTOPOINT;
1169 #ifdef CONFIG_NET_IPGRE_BROADCAST
1170 if (MULTICAST(iph->daddr)) {
1171 if (!iph->saddr)
1172 return -EINVAL;
1173 dev->flags = IFF_BROADCAST;
1174 dev->hard_header = ipgre_header;
1175 dev->open = ipgre_open;
1176 dev->stop = ipgre_close;
1178 #endif
1181 if (!tdev && tunnel->parms.link)
1182 tdev = __dev_get_by_index(tunnel->parms.link);
1184 if (tdev) {
1185 hlen = tdev->hard_header_len;
1186 mtu = tdev->mtu;
1188 dev->iflink = tunnel->parms.link;
1190 /* Precalculate GRE options length */
1191 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1192 if (tunnel->parms.o_flags&GRE_CSUM)
1193 addend += 4;
1194 if (tunnel->parms.o_flags&GRE_KEY)
1195 addend += 4;
1196 if (tunnel->parms.o_flags&GRE_SEQ)
1197 addend += 4;
1199 dev->hard_header_len = hlen + addend;
1200 dev->mtu = mtu - addend;
1201 tunnel->hlen = addend;
1202 return 0;
1205 #ifdef MODULE
1206 static int ipgre_fb_tunnel_open(struct net_device *dev)
1208 MOD_INC_USE_COUNT;
1209 return 0;
1212 static int ipgre_fb_tunnel_close(struct net_device *dev)
1214 MOD_DEC_USE_COUNT;
1215 return 0;
1217 #endif
1219 int __init ipgre_fb_tunnel_init(struct net_device *dev)
1221 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1222 struct iphdr *iph;
1224 ipgre_tunnel_init_gen(dev);
1225 #ifdef MODULE
1226 dev->open = ipgre_fb_tunnel_open;
1227 dev->stop = ipgre_fb_tunnel_close;
1228 #endif
1230 iph = &ipgre_fb_tunnel.parms.iph;
1231 iph->version = 4;
1232 iph->protocol = IPPROTO_GRE;
1233 iph->ihl = 5;
1234 tunnel->hlen = sizeof(struct iphdr) + 4;
1236 dev_hold(dev);
1237 tunnels_wc[0] = &ipgre_fb_tunnel;
1238 return 0;
1242 static struct inet_protocol ipgre_protocol = {
1243 ipgre_rcv, /* GRE handler */
1244 ipgre_err, /* TUNNEL error control */
1245 0, /* next */
1246 IPPROTO_GRE, /* protocol ID */
1247 0, /* copy */
1248 NULL, /* data */
1249 "GRE" /* name */
1254 * And now the modules code and kernel interface.
/* Entry point (init_module when built modular, ipgre_init when
   built in): register the fallback gre0 device and hook IPPROTO_GRE
   into the inet protocol table.  The built-in path must take rtnl
   itself; register_netdev does so internally. */
1257 #ifdef MODULE
1258 int init_module(void)
1259 #else
1260 int __init ipgre_init(void)
1261 #endif
1263 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1265 ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel;
1266 #ifdef MODULE
1267 register_netdev(&ipgre_fb_tunnel_dev);
1268 #else
1269 rtnl_lock();
1270 register_netdevice(&ipgre_fb_tunnel_dev);
1271 rtnl_unlock();
1272 #endif
1274 inet_add_protocol(&ipgre_protocol);
1275 return 0;
1278 #ifdef MODULE
1280 void cleanup_module(void)
1282 if ( inet_del_protocol(&ipgre_protocol) < 0 )
1283 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1285 unregister_netdev(&ipgre_fb_tunnel_dev);
1288 #endif