/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif
/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is the best
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops
   (see the sketch after this comment). It looks like the
   dev->tbusy flag, but I preferred a new variable, because
   the semantics are different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated
   to the upper header. It is a very good solution, but it introduces
   two problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and the traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least, in the
     neighbourhood of mine) return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially
   taking fragmentation into account. To be short, it is not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   quickly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated who injected the
   fatal route into the network, even if it were you who configured the
   fatal static route: you are innocent. :-)

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally modular.
   We could extract the common parts (hash table, ioctl etc.)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
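/* A minimal sketch (not part of the original logic) of the t->recursion
 * guard described in item 1 above; ipgre_tunnel_xmit() below uses exactly
 * this shape at its entry and exits.
 */
#if 0	/* illustration only, not compiled */
static int guarded_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;

	if (tunnel->recursion++) {	/* re-entered our own transmitter */
		tunnel->stat.collisions++;
		tunnel->recursion--;
		dev_kfree_skb(skb);	/* break the local dead loop */
		return 0;
	}
	/* ... encapsulate and send the packet ... */
	tunnel->recursion--;		/* normal exit */
	return 0;
}
#endif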
static int ipgre_tunnel_init(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

static struct net_device ipgre_fb_tunnel_dev = {
	NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
};

static struct ip_tunnel ipgre_fb_tunnel = {
	NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
};
/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet,
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if they do not match a configured keyless tunnel,
   will match the fallback tunnel.
 */
#define HASH_SIZE	16
#define HASH(addr)	((addr^(addr>>4))&0xF)
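/* Worked example of the hash fold above (value is illustrative only):
 * HASH(0x12345678) = (0x12345678 ^ 0x01234567) & 0xF
 *                  = (0x8 ^ 0x7) & 0xF = 0xF,
 * i.e. only the two lowest nibbles of the address influence the bucket,
 * which is cheap and good enough for a 16-entry table.
 */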
static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])
#define tunnels_r	(tunnels[2])
#define tunnels_l	(tunnels[1])
#define tunnels_wc	(tunnels[0])

static rwlock_t ipgre_lock = RW_LOCK_UNLOCKED;
/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;

	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr && MULTICAST(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
			return t;
	}
	if (ipgre_fb_tunnel_dev.flags&IFF_UP)
		return &ipgre_fb_tunnel;
	return NULL;
}
static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
{
	u32 remote = t->parms.iph.daddr;
	u32 local = t->parms.iph.saddr;
	u32 key = t->parms.i_key;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !MULTICAST(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &tunnels[prio][h];
}
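/* Worked example for ipgre_bucket() above (addresses illustrative only):
 * a tunnel with both local and a unicast remote set gets prio = 1|2 = 3,
 * i.e. the (remote,local) table; a multicast remote contributes nothing
 * to prio, so such a tunnel hashes on the key alone in tunnels_l or
 * tunnels_wc.
 */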
static void ipgre_tunnel_link(struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}

static void ipgre_tunnel_unlink(struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
{
	u32 remote = parms->iph.daddr;
	u32 local = parms->iph.saddr;
	u32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !MULTICAST(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}
	for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	MOD_INC_USE_COUNT;
	dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL);
	if (dev == NULL) {
		MOD_DEC_USE_COUNT;
		return NULL;
	}
	memset(dev, 0, sizeof(*dev) + sizeof(*t));
	dev->priv = (void*)(dev+1);
	nt = (struct ip_tunnel*)dev->priv;
	nt->dev = dev;
	dev->name = nt->parms.name;
	dev->init = ipgre_tunnel_init;
	dev->new_style = 1;
	memcpy(&nt->parms, parms, sizeof(*parms));
	if (dev->name[0] == 0) {
		int i;
		for (i=1; i<100; i++) {
			sprintf(dev->name, "gre%d", i);
			if (__dev_get_by_name(dev->name) == NULL)
				break;
		}
		if (i==100)
			goto failed;
		memcpy(parms->name, dev->name, IFNAMSIZ);
	}
	if (register_netdevice(dev) < 0)
		goto failed;

	dev_hold(dev);
	ipgre_tunnel_link(nt);
	/* Do not decrement MOD_USE_COUNT here. */
	return nt;

failed:
	kfree(dev);
	MOD_DEC_USE_COUNT;
	return NULL;
}
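/* Layout note for the allocation in ipgre_tunnel_locate() above (a
 * description of the existing code, not a new mechanism): one kmalloc()
 * carries both objects, so freeing dev releases the tunnel too.
 *
 *	+-------------------+------------------+
 *	| struct net_device | struct ip_tunnel |
 *	+-------------------+------------------+
 *	^ dev                ^ dev->priv == (void*)(dev+1)
 */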
static void ipgre_tunnel_destructor(struct net_device *dev)
{
	if (dev != &ipgre_fb_tunnel_dev) {
		MOD_DEC_USE_COUNT;
	}
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
	dev_put(dev);
}
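/* GRE header layout assumed by ipgre_err() and ipgre_rcv() below
 * (RFC 1701; a reading aid, the kernel defines no such struct here):
 *
 *	u16 flags      - GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION
 *	u16 protocol   - ETH_P_* of the encapsulated packet
 *	u32 csum       - present iff GRE_CSUM is set
 *	u32 key        - present iff GRE_KEY is set
 *	u32 seqno      - present iff GRE_SEQ is set
 *
 * Hence the parsers start with grehlen/offset = 4 and add 4 for each
 * optional word whose flag is present.
 */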
void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len)
{
#ifndef I_WISH_WORLD_WERE_PERFECT

/* It is not :-( All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. It makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder, rfc1812 was written by a Cisco employee;
   why the hell do these idiots break standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr*)dp;
	u16 *p = (u16*)(dp+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct ip_tunnel *t;
	u16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (len < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
#else
	struct iphdr *iph = (struct iphdr*)dp;
	struct iphdr *eiph;
	u16 *p = (u16*)(dp+(iph->ihl<<2));
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	int rel_type = 0;
	int rel_code = 0;
	int rel_info = 0;
	u16 flags;
	int grehlen = (iph->ihl<<2) + 4;
	struct sk_buff *skb2;
	struct rtable *rt;

	if (p[1] != __constant_htons(ETH_P_IP))
		return;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_CSUM)
			grehlen += 4;
		if (flags&GRE_KEY)
			grehlen += 4;
		if (flags&GRE_SEQ)
			grehlen += 4;
	}
	if (len < grehlen + sizeof(struct iphdr))
		return;
	eiph = (struct iphdr*)(dp + grehlen);

	switch (type) {
	default:
		return;
	case ICMP_PARAMETERPROB:
		if (skb->h.icmph->un.gateway < (iph->ihl<<2))
			return;

		/* So... This guy found something strange INSIDE the
		   encapsulated packet. Well, he is a fool, but what
		   can we do?
		 */
		rel_type = ICMP_PARAMETERPROB;
		rel_info = skb->h.icmph->un.gateway - grehlen;
		break;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* And it is the only really necessary thing :-) */
			rel_info = ntohs(skb->h.icmph->un.frag.mtu);
			if (rel_info < grehlen+68)
				return;
			rel_info -= grehlen;
			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
			if (rel_info > ntohs(eiph->tot_len))
				return;
			break;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe it is just ether pollution. --ANK
			 */
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	/* Prepare fake skb to feed it to icmp_send */
	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (skb2 == NULL)
		return;
	dst_release(skb2->dst);
	skb2->dst = NULL;
	skb_pull(skb2, skb->data - (u8*)eiph);
	skb2->nh.raw = skb2->data;

	/* Try to guess incoming interface */
	if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
		kfree_skb(skb2);
		return;
	}
	skb2->dev = rt->u.dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags&RTCF_LOCAL) {
		ip_rt_put(rt);
		rt = NULL;
		if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
			ip_rt_put(rt);
			kfree_skb(skb2);
			return;
		}
	} else {
		ip_rt_put(rt);
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
		    skb2->dst->dev->type != ARPHRD_IPGRE) {
			kfree_skb(skb2);
			return;
		}
	}

	/* change mtu on this route */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		if (rel_info > skb2->dst->pmtu) {
			kfree_skb(skb2);
			return;
		}
		skb2->dst->pmtu = rel_info;
		rel_info = htonl(rel_info);
	} else if (type == ICMP_TIME_EXCEEDED) {
		struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
		if (t->parms.iph.ttl) {
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;
		}
	}

	icmp_send(skb2, rel_type, rel_code, rel_info);
	kfree_skb(skb2);
#endif
}
int ipgre_rcv(struct sk_buff *skb, unsigned short len)
{
	struct iphdr *iph = skb->nh.iph;
	u8 *h = skb->h.raw;
	u16 flags = *(u16*)h;
	u16 csum = 0;
	u32 key = 0;
	u32 seqno = 0;
	struct ip_tunnel *tunnel;
	int offset = 4;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			csum = ip_compute_csum(h, len);
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(u32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(u32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
		skb->mac.raw = skb->nh.raw;
		skb->nh.raw = skb_pull(skb, h + offset - skb->data);
		memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
		skb->ip_summed = 0;
		skb->protocol = *(u16*)(h + 2);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (((struct rtable*)skb->dst)->key.iif == 0)
				goto drop;
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
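/* The in-order test in ipgre_rcv() above relies on wraparound-safe
 * serial arithmetic: casting the 32-bit difference to s32 makes e.g.
 * seqno 0x00000002 count as newer than i_seqno 0xFFFFFFFF. A minimal
 * sketch of the same check (illustration only, not compiled):
 */
#if 0
static int gre_seq_is_old(u32 seqno, u32 expected)
{
	/* negative signed difference => seqno lags behind expected */
	return (s32)(seqno - expected) < 0;
}
#endif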
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr *old_iph = skb->nh.iph;
	struct iphdr *tiph;
	u8 tos;
	u16 df;
	struct rtable *rt;		/* Route to the other host */
	struct net_device *tdev;	/* Device to other host */
	struct iphdr *iph;		/* Our new IP header */
	int max_headroom;		/* The extra header space needed */
	int gre_hlen;
	u32 dst;
	int mtu;

	if (tunnel->recursion++) {
		tunnel->stat.collisions++;
		goto tx_error;
	}

	if (dev->hard_header) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == __constant_htons(ETH_P_IP)) {
			rt = (struct rtable*)skb->dst;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &skb->nh.ipv6h->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == __constant_htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
		tunnel->stat.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->u.dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		tunnel->stat.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	mtu = rt->u.dst.pmtu - tunnel->hlen;

	if (skb->protocol == __constant_htons(ETH_P_IP)) {
		if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68)
			skb->dst->pmtu = mtu;

		df |= (old_iph->frag_off&__constant_htons(IP_DF));

		if ((old_iph->frag_off&__constant_htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->pmtu = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	skb->h.raw = skb->nh.raw;

	max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
	}

	skb->nh.raw = skb_push(skb, gre_hlen);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the GRE header.
	 */

	iph = skb->nh.iph;
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_GRE;
	iph->tos = tos;
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == __constant_htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == __constant_htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip_statistics.IpDefaultTTL;
	}

	((u16*)(iph+1))[0] = tunnel->parms.o_flags;
	((u16*)(iph+1))[1] = skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	iph->tot_len = htons(skb->len);
	iph->id = htons(ip_id_count++);
	ip_send_check(iph);

	stats->tx_bytes += skb->len;
	stats->tx_packets++;
	ip_send(skb);
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}
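/* Arithmetic note on max_headroom above (illustrative numbers): the
 * expression ((hard_header_len+15)&~15) rounds the link header up to a
 * multiple of 16, so an Ethernet tdev (hard_header_len 14) reserves
 * 16 + gre_hlen bytes; with a 20-byte outer iphdr and a 4-byte GRE base
 * header that is 16 + 24 = 40 bytes of headroom.
 */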
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;

	MOD_INC_USE_COUNT;

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == &ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(&p, 0);
		}
		if (t == NULL)
			t = (struct ip_tunnel*)dev->priv;
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= __constant_htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);

		if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
		    t != &ipgre_fb_tunnel) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = (struct ip_tunnel*)dev->priv;

				if (MULTICAST(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == &ipgre_fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == &ipgre_fb_tunnel)
				goto done;
		}

		err = unregister_netdevice(dev);
		break;

	default:
		err = -EINVAL;
	}

done:
	MOD_DEC_USE_COUNT;
	return err;
}
static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
{
	return &(((struct ip_tunnel*)dev->priv)->stat);
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;

	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
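/* MTU bounds above, with illustrative numbers: 68 is the minimum IPv4
 * MTU, and 0xFFF8 - tunnel->hlen keeps the encapsulated datagram within
 * the 16-bit tot_len once the tunnel header is added. E.g. for a plain
 * tunnel (hlen = 20 + 4 = 24) the ceiling is 65528 - 24 = 65504.
 */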
#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could do something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
			void *daddr, void *saddr, unsigned len)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	u16 *p = (u16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;
	p[1] = htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	if (iph->daddr && !MULTICAST(iph->daddr))
		return t->hlen;

	return -t->hlen;
}
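/* Return-value convention sketch for ipgre_header() above (the usual
 * hard_header contract, nothing GRE-specific): a positive length means
 * the header is complete; the negative length tells the caller that the
 * destination address is still unresolved and must be filled in before
 * the frame can be transmitted.
 */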
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;

	MOD_INC_USE_COUNT;
	if (MULTICAST(t->parms.iph.daddr)) {
		struct rtable *rt;
		if (ip_route_output(&rt, t->parms.iph.daddr,
				    t->parms.iph.saddr, RT_TOS(t->parms.iph.tos),
				    t->parms.link)) {
			MOD_DEC_USE_COUNT;
			return -EADDRNOTAVAIL;
		}
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get(dev) == NULL) {
			MOD_DEC_USE_COUNT;
			return -EADDRNOTAVAIL;
		}
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
	if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev = inetdev_by_index(t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	MOD_DEC_USE_COUNT;
	return 0;
}

#endif
static void ipgre_tunnel_init_gen(struct net_device *dev)
{
	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;

	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= ipgre_tunnel_destructor;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev_init_buffers(dev);

	dev->type		= ARPHRD_IPGRE;
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= 1500 - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
}
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = 1500;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = (struct ip_tunnel*)dev->priv;
	iph = &tunnel->parms.iph;

	ipgre_tunnel_init_gen(dev);

	/* Guess output device to choose reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct rtable *rt;
		if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		dev->flags |= IFF_POINTOPOINT;

#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->hard_header = ipgre_header;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
	return 0;
}
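/* Worked example for the addend computed above (illustrative only): a
 * tunnel with checksums and a key enabled gets addend = 20 (iphdr) +
 * 4 (base GRE) + 4 (csum) + 4 (key) = 32, so over a 1500-byte Ethernet
 * path dev->mtu becomes 1500 - 32 = 1468 and tunnel->hlen = 32.
 */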
#ifdef MODULE
static int ipgre_fb_tunnel_open(struct net_device *dev)
{
	MOD_INC_USE_COUNT;
	return 0;
}

static int ipgre_fb_tunnel_close(struct net_device *dev)
{
	MOD_DEC_USE_COUNT;
	return 0;
}
#endif
int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
	struct iphdr *iph;

	ipgre_tunnel_init_gen(dev);
#ifdef MODULE
	dev->open		= ipgre_fb_tunnel_open;
	dev->stop		= ipgre_fb_tunnel_close;
#endif

	iph = &ipgre_fb_tunnel.parms.iph;
	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	tunnels_wc[0]		= &ipgre_fb_tunnel;
	return 0;
}
static struct inet_protocol ipgre_protocol = {
	ipgre_rcv,		/* GRE handler		*/
	ipgre_err,		/* TUNNEL error control	*/
	0,			/* next			*/
	IPPROTO_GRE,		/* protocol ID		*/
	0,			/* copy			*/
	NULL,			/* data			*/
	"GRE"			/* name			*/
};
/*
 *	And now the module's code and kernel interface.
 */

#ifdef MODULE
int init_module(void)
#else
int __init ipgre_init(void)
#endif
{
	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel;
	ipgre_fb_tunnel_dev.name = ipgre_fb_tunnel.parms.name;
#ifdef MODULE
	register_netdev(&ipgre_fb_tunnel_dev);
#else
	register_netdevice(&ipgre_fb_tunnel_dev);
#endif

	inet_add_protocol(&ipgre_protocol);
	return 0;
}
#ifdef MODULE

void cleanup_module(void)
{
	if ( inet_del_protocol(&ipgre_protocol) < 0 )
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	unregister_netdev(&ipgre_fb_tunnel_dev);
}

#endif