Import 2.2.5pre2
[davej-history.git] / net / ipv4 / ip_gre.c
blob6a7546fd50d81d483fe0571a0a16d2f710610ded
1 /*
2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ipip.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
39 #ifdef CONFIG_IPV6
40 #include <net/ipv6.h>
41 #include <net/ip6_fib.h>
42 #include <net/ip6_route.h>
43 #endif
46 Problems & solutions
47 --------------------
49 1. The most important issue is detecting local dead loops.
50 They would cause complete host lockup in transmit, which
51 would be "resolved" by stack overflow or, if queueing is enabled,
52 with infinite looping in net_bh.
54 We cannot track such dead loops during route installation,
55 it is infeasible task. The most general solutions would be
56 to keep skb->encapsulation counter (sort of local ttl),
57 and silently drop packet when it expires. It is the best
58 solution, but it supposes maintaining a new variable in ALL
59 skb, even if no tunneling is used.
61 Current solution: t->recursion lock breaks dead loops. It looks
62 like dev->tbusy flag, but I preferred new variable, because
63 the semantics is different. One day, when hard_start_xmit
64 will be multithreaded we will have to use skb->encapsulation.
68 2. Networking dead loops would not kill routers, but would really
69 kill network. IP hop limit plays role of "t->recursion" in this case,
70 if we copy it from packet being encapsulated to upper header.
71 It is very good solution, but it introduces two problems:
73 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
74 do not work over tunnels.
75 - traceroute does not work. I planned to relay ICMP from tunnel,
76 so that this problem would be solved and traceroute output
77 would even more informative. This idea appeared to be wrong:
78 only Linux complies to rfc1812 now (yes, guys, Linux is the only
79 true router now :-)), all routers (at least, in neighbourhood of mine)
80 return only 8 bytes of payload. It is the end.
82 Hence, if we want that OSPF worked or traceroute said something reasonable,
83 we should search for another solution.
85 One of them is to parse packet trying to detect inner encapsulation
86 made by our node. It is difficult or even impossible, especially,
87 taking into account fragmentation. To be short, it is not a solution at all.
89 Current solution: The solution was UNEXPECTEDLY SIMPLE.
90 We force DF flag on tunnels with preconfigured hop limit,
91 that is ALL. :-) Well, it does not remove the problem completely,
92 but exponential growth of network traffic is changed to linear
93 (branches, that exceed pmtu are pruned) and tunnel mtu
94 fastly degrades to value <68, where looping stops.
95 Yes, it is not good if there exists a router in the loop,
96 which does not force DF, even when encapsulating packets have DF set.
97 But it is not our problem! Nobody could accuse us, we made
98 all that we could make. Even if it is your gated who injected
99 fatal route to network, even if it were you who configured
100 fatal static route: you are innocent. :-)
104 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
105 practically identical code. It would be good to glue them
106 together, but it is not very evident, how to make them modular.
107 sit is integral part of IPv6, ipip and gre are naturally modular.
108 We could extract common parts (hash table, ioctl etc)
109 to a separate module (ip_tunnel.c).
111 Alexey Kuznetsov.
114 static int ipgre_tunnel_init(struct device *dev);
116 /* Fallback tunnel: no source, no destination, no key, no options */
118 static int ipgre_fb_tunnel_init(struct device *dev);
120 static struct device ipgre_fb_tunnel_dev = {
121 NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
124 static struct ip_tunnel ipgre_fb_tunnel = {
125 NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
128 /* Tunnel hash table */
131 4 hash tables:
133 3: (remote,local)
134 2: (remote,*)
135 1: (*,local)
136 0: (*,*)
138 We require exact key match i.e. if a key is present in packet
139 it will match only tunnel with the same key; if it is not present,
140 it will match only keyless tunnel.
142 All keyless packets, if not matched by configured keyless tunnels,
143 will match fallback tunnel.
/*
 * Geometry of the tunnel hash tables.
 *
 * HASH_SIZE is the number of buckets per table and must be a power of
 * two, because HASH() reduces its value with a (HASH_SIZE-1) mask.
 *
 * HASH() folds the low byte of an address or key into a bucket index.
 * The argument is fully parenthesised so that compound expressions such
 * as HASH(a|b) hash the value the caller wrote.  It is still evaluated
 * twice, so it must not have side effects.
 */
#define HASH_SIZE  16
#define HASH(addr) (((addr)^((addr)>>4))&(HASH_SIZE-1))

/* One table per (remote, local) wildcard combination; see aliases below. */
static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])	/* (remote,local) */
#define tunnels_r	(tunnels[2])	/* (remote,*)     */
#define tunnels_l	(tunnels[1])	/* (*,local)      */
#define tunnels_wc	(tunnels[0])	/* (*,*)          */
156 /* Given src, dst and key, find approriate for input tunnel. */
158 static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
160 unsigned h0 = HASH(remote);
161 unsigned h1 = HASH(key);
162 struct ip_tunnel *t;
164 for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
165 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
166 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
167 return t;
170 for (t = tunnels_r[h0^h1]; t; t = t->next) {
171 if (remote == t->parms.iph.daddr) {
172 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
173 return t;
176 for (t = tunnels_l[h1]; t; t = t->next) {
177 if (local == t->parms.iph.saddr ||
178 (local == t->parms.iph.daddr && MULTICAST(local))) {
179 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
180 return t;
183 for (t = tunnels_wc[h1]; t; t = t->next) {
184 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
185 return t;
187 if (ipgre_fb_tunnel_dev.flags&IFF_UP)
188 return &ipgre_fb_tunnel;
189 return NULL;
192 static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
194 u32 remote = t->parms.iph.daddr;
195 u32 local = t->parms.iph.saddr;
196 u32 key = t->parms.i_key;
197 unsigned h = HASH(key);
198 int prio = 0;
200 if (local)
201 prio |= 1;
202 if (remote && !MULTICAST(remote)) {
203 prio |= 2;
204 h ^= HASH(remote);
207 return &tunnels[prio][h];
210 static void ipgre_tunnel_link(struct ip_tunnel *t)
212 struct ip_tunnel **tp = ipgre_bucket(t);
214 t->next = *tp;
215 wmb();
216 *tp = t;
219 static void ipgre_tunnel_unlink(struct ip_tunnel *t)
221 struct ip_tunnel **tp;
223 for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
224 if (t == *tp) {
225 *tp = t->next;
226 synchronize_bh();
227 break;
232 static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
234 u32 remote = parms->iph.daddr;
235 u32 local = parms->iph.saddr;
236 u32 key = parms->i_key;
237 struct ip_tunnel *t, **tp, *nt;
238 struct device *dev;
239 unsigned h = HASH(key);
240 int prio = 0;
242 if (local)
243 prio |= 1;
244 if (remote && !MULTICAST(remote)) {
245 prio |= 2;
246 h ^= HASH(remote);
248 for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
249 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
250 if (key == t->parms.i_key)
251 return t;
254 if (!create)
255 return NULL;
257 MOD_INC_USE_COUNT;
258 dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL);
259 if (dev == NULL) {
260 MOD_DEC_USE_COUNT;
261 return NULL;
263 memset(dev, 0, sizeof(*dev) + sizeof(*t));
264 dev->priv = (void*)(dev+1);
265 nt = (struct ip_tunnel*)dev->priv;
266 nt->dev = dev;
267 dev->name = nt->parms.name;
268 dev->init = ipgre_tunnel_init;
269 memcpy(&nt->parms, parms, sizeof(*parms));
270 if (dev->name[0] == 0) {
271 int i;
272 for (i=1; i<100; i++) {
273 sprintf(dev->name, "gre%d", i);
274 if (dev_get(dev->name) == NULL)
275 break;
277 if (i==100)
278 goto failed;
279 memcpy(parms->name, dev->name, IFNAMSIZ);
281 if (register_netdevice(dev) < 0)
282 goto failed;
284 ipgre_tunnel_link(nt);
285 /* Do not decrement MOD_USE_COUNT here. */
286 return nt;
288 failed:
289 kfree(dev);
290 MOD_DEC_USE_COUNT;
291 return NULL;
294 static void ipgre_tunnel_destroy(struct device *dev)
296 ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
298 if (dev != &ipgre_fb_tunnel_dev) {
299 kfree(dev);
300 MOD_DEC_USE_COUNT;
305 void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len)
307 #ifndef I_WISH_WORLD_WERE_PERFECT
309 /* It is not :-( All the routers (except for Linux) return only
310 8 bytes of packet payload. It means, that precise relaying of
311 ICMP in the real Internet is absolutely infeasible.
313 Moreover, Cisco "wise men" put GRE key to the third word
314 in GRE header. It makes impossible maintaining even soft state for keyed
315 GRE tunnels with enabled checksum. Tell them "thank you".
317 Well, I wonder, rfc1812 was written by Cisco employee,
318 what the hell these idiots break standrads established
319 by themself???
322 struct iphdr *iph = (struct iphdr*)dp;
323 u16 *p = (u16*)(dp+(iph->ihl<<2));
324 int grehlen = (iph->ihl<<2) + 4;
325 int type = skb->h.icmph->type;
326 int code = skb->h.icmph->code;
327 struct ip_tunnel *t;
328 u16 flags;
330 flags = p[0];
331 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
332 if (flags&(GRE_VERSION|GRE_ROUTING))
333 return;
334 if (flags&GRE_KEY) {
335 grehlen += 4;
336 if (flags&GRE_CSUM)
337 grehlen += 4;
341 /* If only 8 bytes returned, keyed message will be dropped here */
342 if (len < grehlen)
343 return;
345 switch (type) {
346 default:
347 case ICMP_PARAMETERPROB:
348 return;
350 case ICMP_DEST_UNREACH:
351 switch (code) {
352 case ICMP_SR_FAILED:
353 case ICMP_PORT_UNREACH:
354 /* Impossible event. */
355 return;
356 case ICMP_FRAG_NEEDED:
357 /* Soft state for pmtu is maintained by IP core. */
358 return;
359 default:
360 /* All others are translated to HOST_UNREACH.
361 rfc2003 contains "deep thoughts" about NET_UNREACH,
362 I believe they are just ether pollution. --ANK
364 break;
366 break;
367 case ICMP_TIME_EXCEEDED:
368 if (code != ICMP_EXC_TTL)
369 return;
370 break;
373 t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
374 if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
375 return;
377 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
378 return;
380 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
381 t->err_count++;
382 else
383 t->err_count = 1;
384 t->err_time = jiffies;
385 return;
386 #else
387 struct iphdr *iph = (struct iphdr*)dp;
388 struct iphdr *eiph;
389 u16 *p = (u16*)(dp+(iph->ihl<<2));
390 int type = skb->h.icmph->type;
391 int code = skb->h.icmph->code;
392 int rel_type = 0;
393 int rel_code = 0;
394 int rel_info = 0;
395 u16 flags;
396 int grehlen = (iph->ihl<<2) + 4;
397 struct sk_buff *skb2;
398 struct rtable *rt;
400 if (p[1] != __constant_htons(ETH_P_IP))
401 return;
403 flags = p[0];
404 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
405 if (flags&(GRE_VERSION|GRE_ROUTING))
406 return;
407 if (flags&GRE_CSUM)
408 grehlen += 4;
409 if (flags&GRE_KEY)
410 grehlen += 4;
411 if (flags&GRE_SEQ)
412 grehlen += 4;
414 if (len < grehlen + sizeof(struct iphdr))
415 return;
416 eiph = (struct iphdr*)(dp + grehlen);
418 switch (type) {
419 default:
420 return;
421 case ICMP_PARAMETERPROB:
422 if (skb->h.icmph->un.gateway < (iph->ihl<<2))
423 return;
425 /* So... This guy found something strange INSIDE encapsulated
426 packet. Well, he is fool, but what can we do ?
428 rel_type = ICMP_PARAMETERPROB;
429 rel_info = skb->h.icmph->un.gateway - grehlen;
430 break;
432 case ICMP_DEST_UNREACH:
433 switch (code) {
434 case ICMP_SR_FAILED:
435 case ICMP_PORT_UNREACH:
436 /* Impossible event. */
437 return;
438 case ICMP_FRAG_NEEDED:
439 /* And it is the only really necesary thing :-) */
440 rel_info = ntohs(skb->h.icmph->un.frag.mtu);
441 if (rel_info < grehlen+68)
442 return;
443 rel_info -= grehlen;
444 /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
445 if (rel_info > ntohs(eiph->tot_len))
446 return;
447 break;
448 default:
449 /* All others are translated to HOST_UNREACH.
450 rfc2003 contains "deep thoughts" about NET_UNREACH,
451 I believe, it is just ether pollution. --ANK
453 rel_type = ICMP_DEST_UNREACH;
454 rel_code = ICMP_HOST_UNREACH;
455 break;
457 break;
458 case ICMP_TIME_EXCEEDED:
459 if (code != ICMP_EXC_TTL)
460 return;
461 break;
464 /* Prepare fake skb to feed it to icmp_send */
465 skb2 = skb_clone(skb, GFP_ATOMIC);
466 if (skb2 == NULL)
467 return;
468 dst_release(skb2->dst);
469 skb2->dst = NULL;
470 skb_pull(skb2, skb->data - (u8*)eiph);
471 skb2->nh.raw = skb2->data;
473 /* Try to guess incoming interface */
474 if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
475 kfree_skb(skb2);
476 return;
478 skb2->dev = rt->u.dst.dev;
480 /* route "incoming" packet */
481 if (rt->rt_flags&RTCF_LOCAL) {
482 ip_rt_put(rt);
483 rt = NULL;
484 if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
485 rt->u.dst.dev->type != ARPHRD_IPGRE) {
486 ip_rt_put(rt);
487 kfree_skb(skb2);
488 return;
490 } else {
491 ip_rt_put(rt);
492 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
493 skb2->dst->dev->type != ARPHRD_IPGRE) {
494 kfree_skb(skb2);
495 return;
499 /* change mtu on this route */
500 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
501 if (rel_info > skb2->dst->pmtu) {
502 kfree_skb(skb2);
503 return;
505 skb2->dst->pmtu = rel_info;
506 rel_info = htonl(rel_info);
507 } else if (type == ICMP_TIME_EXCEEDED) {
508 struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
509 if (t->parms.iph.ttl) {
510 rel_type = ICMP_DEST_UNREACH;
511 rel_code = ICMP_HOST_UNREACH;
515 icmp_send(skb2, rel_type, rel_code, rel_info);
516 kfree_skb(skb2);
517 #endif
520 int ipgre_rcv(struct sk_buff *skb, unsigned short len)
522 struct iphdr *iph = skb->nh.iph;
523 u8 *h = skb->h.raw;
524 u16 flags = *(u16*)h;
525 u16 csum = 0;
526 u32 key = 0;
527 u32 seqno = 0;
528 struct ip_tunnel *tunnel;
529 int offset = 4;
531 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
532 /* - Version must be 0.
533 - We do not support routing headers.
535 if (flags&(GRE_VERSION|GRE_ROUTING))
536 goto drop;
538 if (flags&GRE_CSUM) {
539 csum = ip_compute_csum(h, len);
540 offset += 4;
542 if (flags&GRE_KEY) {
543 key = *(u32*)(h + offset);
544 offset += 4;
546 if (flags&GRE_SEQ) {
547 seqno = ntohl(*(u32*)(h + offset));
548 offset += 4;
552 if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
553 skb->mac.raw = skb->nh.raw;
554 skb->nh.raw = skb_pull(skb, h + offset - skb->data);
555 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
556 skb->ip_summed = 0;
557 skb->protocol = *(u16*)(h + 2);
558 skb->pkt_type = PACKET_HOST;
559 #ifdef CONFIG_NET_IPGRE_BROADCAST
560 if (MULTICAST(iph->daddr)) {
561 /* Looped back packet, drop it! */
562 if (((struct rtable*)skb->dst)->key.iif == 0)
563 goto drop;
564 tunnel->stat.multicast++;
565 skb->pkt_type = PACKET_BROADCAST;
567 #endif
569 if (((flags&GRE_CSUM) && csum) ||
570 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
571 tunnel->stat.rx_crc_errors++;
572 tunnel->stat.rx_errors++;
573 goto drop;
575 if (tunnel->parms.i_flags&GRE_SEQ) {
576 if (!(flags&GRE_SEQ) ||
577 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
578 tunnel->stat.rx_fifo_errors++;
579 tunnel->stat.rx_errors++;
580 goto drop;
582 tunnel->i_seqno = seqno + 1;
584 tunnel->stat.rx_packets++;
585 tunnel->stat.rx_bytes += skb->len;
586 skb->dev = tunnel->dev;
587 dst_release(skb->dst);
588 skb->dst = NULL;
589 netif_rx(skb);
590 return(0);
592 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
594 drop:
595 kfree_skb(skb);
596 return(0);
599 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
601 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
602 struct net_device_stats *stats = &tunnel->stat;
603 struct iphdr *old_iph = skb->nh.iph;
604 struct iphdr *tiph;
605 u8 tos;
606 u16 df;
607 struct rtable *rt; /* Route to the other host */
608 struct device *tdev; /* Device to other host */
609 struct iphdr *iph; /* Our new IP header */
610 int max_headroom; /* The extra header space needed */
611 int gre_hlen;
612 u32 dst;
613 int mtu;
615 if (tunnel->recursion++) {
616 tunnel->stat.collisions++;
617 goto tx_error;
620 if (dev->hard_header) {
621 gre_hlen = 0;
622 tiph = (struct iphdr*)skb->data;
623 } else {
624 gre_hlen = tunnel->hlen;
625 tiph = &tunnel->parms.iph;
628 if ((dst = tiph->daddr) == 0) {
629 /* NBMA tunnel */
631 if (skb->dst == NULL) {
632 tunnel->stat.tx_fifo_errors++;
633 goto tx_error;
636 if (skb->protocol == __constant_htons(ETH_P_IP)) {
637 rt = (struct rtable*)skb->dst;
638 if ((dst = rt->rt_gateway) == 0)
639 goto tx_error_icmp;
641 #ifdef CONFIG_IPV6
642 else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
643 struct in6_addr *addr6;
644 int addr_type;
645 struct neighbour *neigh = skb->dst->neighbour;
647 if (neigh == NULL)
648 goto tx_error;
650 addr6 = (struct in6_addr*)&neigh->primary_key;
651 addr_type = ipv6_addr_type(addr6);
653 if (addr_type == IPV6_ADDR_ANY) {
654 addr6 = &skb->nh.ipv6h->daddr;
655 addr_type = ipv6_addr_type(addr6);
658 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
659 goto tx_error_icmp;
661 dst = addr6->s6_addr32[3];
663 #endif
664 else
665 goto tx_error;
668 tos = tiph->tos;
669 if (tos&1) {
670 if (skb->protocol == __constant_htons(ETH_P_IP))
671 tos = old_iph->tos;
672 tos &= ~1;
675 if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
676 tunnel->stat.tx_carrier_errors++;
677 goto tx_error;
679 tdev = rt->u.dst.dev;
681 if (tdev == dev) {
682 ip_rt_put(rt);
683 tunnel->stat.collisions++;
684 goto tx_error;
687 df = tiph->frag_off;
688 mtu = rt->u.dst.pmtu - tunnel->hlen;
690 if (skb->protocol == __constant_htons(ETH_P_IP)) {
691 if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68)
692 skb->dst->pmtu = mtu;
694 df |= (old_iph->frag_off&__constant_htons(IP_DF));
696 if ((old_iph->frag_off&__constant_htons(IP_DF)) &&
697 mtu < ntohs(old_iph->tot_len)) {
698 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
699 ip_rt_put(rt);
700 goto tx_error;
703 #ifdef CONFIG_IPV6
704 else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
705 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
707 if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
708 if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
709 rt6->rt6i_dst.plen == 128) {
710 rt6->rt6i_flags |= RTF_MODIFIED;
711 skb->dst->pmtu = mtu;
715 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
716 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
717 ip_rt_put(rt);
718 goto tx_error;
721 #endif
723 if (tunnel->err_count > 0) {
724 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
725 tunnel->err_count--;
727 dst_link_failure(skb);
728 } else
729 tunnel->err_count = 0;
732 skb->h.raw = skb->nh.raw;
734 max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen;
736 if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
737 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
738 if (!new_skb) {
739 ip_rt_put(rt);
740 stats->tx_dropped++;
741 dev_kfree_skb(skb);
742 tunnel->recursion--;
743 return 0;
745 if (skb->sk)
746 skb_set_owner_w(new_skb, skb->sk);
747 dev_kfree_skb(skb);
748 skb = new_skb;
751 skb->nh.raw = skb_push(skb, gre_hlen);
752 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
753 dst_release(skb->dst);
754 skb->dst = &rt->u.dst;
757 * Push down and install the IPIP header.
760 iph = skb->nh.iph;
761 iph->version = 4;
762 iph->ihl = sizeof(struct iphdr) >> 2;
763 iph->frag_off = df;
764 iph->protocol = IPPROTO_GRE;
765 iph->tos = tos;
766 iph->daddr = rt->rt_dst;
767 iph->saddr = rt->rt_src;
769 if ((iph->ttl = tiph->ttl) == 0) {
770 if (skb->protocol == __constant_htons(ETH_P_IP))
771 iph->ttl = old_iph->ttl;
772 #ifdef CONFIG_IPV6
773 else if (skb->protocol == __constant_htons(ETH_P_IPV6))
774 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
775 #endif
776 else
777 iph->ttl = ip_statistics.IpDefaultTTL;
780 ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
781 ((u16*)(iph+1))[1] = skb->protocol;
783 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
784 u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
786 if (tunnel->parms.o_flags&GRE_SEQ) {
787 ++tunnel->o_seqno;
788 *ptr = htonl(tunnel->o_seqno);
789 ptr--;
791 if (tunnel->parms.o_flags&GRE_KEY) {
792 *ptr = tunnel->parms.o_key;
793 ptr--;
795 if (tunnel->parms.o_flags&GRE_CSUM) {
796 *ptr = 0;
797 *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
801 iph->tot_len = htons(skb->len);
802 iph->id = htons(ip_id_count++);
803 ip_send_check(iph);
805 stats->tx_bytes += skb->len;
806 stats->tx_packets++;
807 ip_send(skb);
808 tunnel->recursion--;
809 return 0;
811 tx_error_icmp:
812 dst_link_failure(skb);
814 tx_error:
815 stats->tx_errors++;
816 dev_kfree_skb(skb);
817 tunnel->recursion--;
818 return 0;
821 static int
822 ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
824 int err = 0;
825 struct ip_tunnel_parm p;
826 struct ip_tunnel *t;
828 MOD_INC_USE_COUNT;
830 switch (cmd) {
831 case SIOCGETTUNNEL:
832 t = NULL;
833 if (dev == &ipgre_fb_tunnel_dev) {
834 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
835 err = -EFAULT;
836 break;
838 t = ipgre_tunnel_locate(&p, 0);
840 if (t == NULL)
841 t = (struct ip_tunnel*)dev->priv;
842 memcpy(&p, &t->parms, sizeof(p));
843 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
844 err = -EFAULT;
845 break;
847 case SIOCADDTUNNEL:
848 case SIOCCHGTUNNEL:
849 err = -EPERM;
850 if (!capable(CAP_NET_ADMIN))
851 goto done;
853 err = -EFAULT;
854 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
855 goto done;
857 err = -EINVAL;
858 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
859 p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF)) ||
860 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
861 goto done;
862 if (p.iph.ttl)
863 p.iph.frag_off |= __constant_htons(IP_DF);
865 if (!(p.i_flags&GRE_KEY))
866 p.i_key = 0;
867 if (!(p.o_flags&GRE_KEY))
868 p.o_key = 0;
870 t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
872 if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
873 t != &ipgre_fb_tunnel) {
874 if (t != NULL) {
875 if (t->dev != dev) {
876 err = -EEXIST;
877 break;
879 } else {
880 unsigned nflags=0;
882 t = (struct ip_tunnel*)dev->priv;
884 if (MULTICAST(p.iph.daddr))
885 nflags = IFF_BROADCAST;
886 else if (p.iph.daddr)
887 nflags = IFF_POINTOPOINT;
889 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
890 err = -EINVAL;
891 break;
893 start_bh_atomic();
894 ipgre_tunnel_unlink(t);
895 t->parms.iph.saddr = p.iph.saddr;
896 t->parms.iph.daddr = p.iph.daddr;
897 t->parms.i_key = p.i_key;
898 t->parms.o_key = p.o_key;
899 memcpy(dev->dev_addr, &p.iph.saddr, 4);
900 memcpy(dev->broadcast, &p.iph.daddr, 4);
901 ipgre_tunnel_link(t);
902 end_bh_atomic();
903 netdev_state_change(dev);
907 if (t) {
908 err = 0;
909 if (cmd == SIOCCHGTUNNEL) {
910 t->parms.iph.ttl = p.iph.ttl;
911 t->parms.iph.tos = p.iph.tos;
912 t->parms.iph.frag_off = p.iph.frag_off;
914 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
915 err = -EFAULT;
916 } else
917 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
918 break;
920 case SIOCDELTUNNEL:
921 err = -EPERM;
922 if (!capable(CAP_NET_ADMIN))
923 goto done;
925 if (dev == &ipgre_fb_tunnel_dev) {
926 err = -EFAULT;
927 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
928 goto done;
929 err = -ENOENT;
930 if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
931 goto done;
932 err = -EPERM;
933 if (t == &ipgre_fb_tunnel)
934 goto done;
936 err = unregister_netdevice(dev);
937 break;
939 default:
940 err = -EINVAL;
943 done:
944 MOD_DEC_USE_COUNT;
945 return err;
948 static struct net_device_stats *ipgre_tunnel_get_stats(struct device *dev)
950 return &(((struct ip_tunnel*)dev->priv)->stat);
953 static int ipgre_tunnel_change_mtu(struct device *dev, int new_mtu)
955 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
956 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
957 return -EINVAL;
958 dev->mtu = new_mtu;
959 return 0;
962 #ifdef CONFIG_NET_IPGRE_BROADCAST
963 /* Nice toy. Unfortunately, useless in real life :-)
964 It allows to construct virtual multiprotocol broadcast "LAN"
965 over the Internet, provided multicast routing is tuned.
968 I have no idea was this bicycle invented before me,
969 so that I had to set ARPHRD_IPGRE to a random value.
970 I have an impression, that Cisco could make something similar,
971 but this feature is apparently missing in IOS<=11.2(8).
973 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
974 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
976 ping -t 255 224.66.66.66
978 If nobody answers, mbone does not work.
980 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
981 ip addr add 10.66.66.<somewhat>/24 dev Universe
982 ifconfig Universe up
983 ifconfig Universe add fe80::<Your_real_addr>/10
984 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
985 ftp 10.66.66.66
987 ftp fec0:6666:6666::193.233.7.65
992 static int ipgre_header(struct sk_buff *skb, struct device *dev, unsigned short type,
993 void *daddr, void *saddr, unsigned len)
995 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
996 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
997 u16 *p = (u16*)(iph+1);
999 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1000 p[0] = t->parms.o_flags;
1001 p[1] = htons(type);
1004 * Set the source hardware address.
1007 if (saddr)
1008 memcpy(&iph->saddr, saddr, 4);
1010 if (daddr) {
1011 memcpy(&iph->daddr, daddr, 4);
1012 return t->hlen;
1014 if (iph->daddr && !MULTICAST(iph->daddr))
1015 return t->hlen;
1017 return -t->hlen;
1020 static int ipgre_open(struct device *dev)
1022 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1024 MOD_INC_USE_COUNT;
1025 if (MULTICAST(t->parms.iph.daddr)) {
1026 struct rtable *rt;
1027 if (ip_route_output(&rt, t->parms.iph.daddr,
1028 t->parms.iph.saddr, RT_TOS(t->parms.iph.tos),
1029 t->parms.link)) {
1030 MOD_DEC_USE_COUNT;
1031 return -EADDRNOTAVAIL;
1033 dev = rt->u.dst.dev;
1034 ip_rt_put(rt);
1035 if (dev->ip_ptr == NULL) {
1036 MOD_DEC_USE_COUNT;
1037 return -EADDRNOTAVAIL;
1039 t->mlink = dev->ifindex;
1040 ip_mc_inc_group(dev->ip_ptr, t->parms.iph.daddr);
1042 return 0;
1045 static int ipgre_close(struct device *dev)
1047 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1048 if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
1049 dev = dev_get_by_index(t->mlink);
1050 if (dev && dev->ip_ptr)
1051 ip_mc_dec_group(dev->ip_ptr, t->parms.iph.daddr);
1053 MOD_DEC_USE_COUNT;
1054 return 0;
1057 #endif
1059 static void ipgre_tunnel_init_gen(struct device *dev)
1061 struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1063 dev->destructor = ipgre_tunnel_destroy;
1064 dev->hard_start_xmit = ipgre_tunnel_xmit;
1065 dev->get_stats = ipgre_tunnel_get_stats;
1066 dev->do_ioctl = ipgre_tunnel_ioctl;
1067 dev->change_mtu = ipgre_tunnel_change_mtu;
1069 dev_init_buffers(dev);
1071 dev->type = ARPHRD_IPGRE;
1072 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1073 dev->mtu = 1500 - sizeof(struct iphdr) - 4;
1074 dev->flags = IFF_NOARP;
1075 dev->iflink = 0;
1076 dev->addr_len = 4;
1077 memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
1078 memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
1081 static int ipgre_tunnel_init(struct device *dev)
1083 struct device *tdev = NULL;
1084 struct ip_tunnel *tunnel;
1085 struct iphdr *iph;
1086 int hlen = LL_MAX_HEADER;
1087 int mtu = 1500;
1088 int addend = sizeof(struct iphdr) + 4;
1090 tunnel = (struct ip_tunnel*)dev->priv;
1091 iph = &tunnel->parms.iph;
1093 ipgre_tunnel_init_gen(dev);
1095 /* Guess output device to choose reasonable mtu and hard_header_len */
1097 if (iph->daddr) {
1098 struct rtable *rt;
1099 if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
1100 tdev = rt->u.dst.dev;
1101 ip_rt_put(rt);
1104 dev->flags |= IFF_POINTOPOINT;
1106 #ifdef CONFIG_NET_IPGRE_BROADCAST
1107 if (MULTICAST(iph->daddr)) {
1108 if (!iph->saddr)
1109 return -EINVAL;
1110 dev->flags = IFF_BROADCAST;
1111 dev->hard_header = ipgre_header;
1112 dev->open = ipgre_open;
1113 dev->stop = ipgre_close;
1115 #endif
1118 if (!tdev && tunnel->parms.link)
1119 tdev = dev_get_by_index(tunnel->parms.link);
1121 if (tdev) {
1122 hlen = tdev->hard_header_len;
1123 mtu = tdev->mtu;
1125 dev->iflink = tunnel->parms.link;
1127 /* Precalculate GRE options length */
1128 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1129 if (tunnel->parms.o_flags&GRE_CSUM)
1130 addend += 4;
1131 if (tunnel->parms.o_flags&GRE_KEY)
1132 addend += 4;
1133 if (tunnel->parms.o_flags&GRE_SEQ)
1134 addend += 4;
1136 dev->hard_header_len = hlen + addend;
1137 dev->mtu = mtu - addend;
1138 tunnel->hlen = addend;
1139 return 0;
1142 #ifdef MODULE
1143 static int ipgre_fb_tunnel_open(struct device *dev)
1145 MOD_INC_USE_COUNT;
1146 return 0;
1149 static int ipgre_fb_tunnel_close(struct device *dev)
1151 MOD_DEC_USE_COUNT;
1152 return 0;
1154 #endif
1156 __initfunc(int ipgre_fb_tunnel_init(struct device *dev))
1158 struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1159 struct iphdr *iph;
1161 ipgre_tunnel_init_gen(dev);
1162 #ifdef MODULE
1163 dev->open = ipgre_fb_tunnel_open;
1164 dev->stop = ipgre_fb_tunnel_close;
1165 #endif
1167 iph = &ipgre_fb_tunnel.parms.iph;
1168 iph->version = 4;
1169 iph->protocol = IPPROTO_GRE;
1170 iph->ihl = 5;
1171 tunnel->hlen = sizeof(struct iphdr) + 4;
1173 tunnels_wc[0] = &ipgre_fb_tunnel;
1174 return 0;
1178 static struct inet_protocol ipgre_protocol = {
1179 ipgre_rcv, /* GRE handler */
1180 ipgre_err, /* TUNNEL error control */
1181 0, /* next */
1182 IPPROTO_GRE, /* protocol ID */
1183 0, /* copy */
1184 NULL, /* data */
1185 "GRE" /* name */
1190 * And now the modules code and kernel interface.
1193 #ifdef MODULE
1194 int init_module(void)
1195 #else
1196 __initfunc(int ipgre_init(void))
1197 #endif
1199 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1201 ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel;
1202 ipgre_fb_tunnel_dev.name = ipgre_fb_tunnel.parms.name;
1203 #ifdef MODULE
1204 register_netdev(&ipgre_fb_tunnel_dev);
1205 #else
1206 register_netdevice(&ipgre_fb_tunnel_dev);
1207 #endif
1209 inet_add_protocol(&ipgre_protocol);
1210 return 0;
1213 #ifdef MODULE
1215 void cleanup_module(void)
1217 if ( inet_del_protocol(&ipgre_protocol) < 0 )
1218 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1220 unregister_netdev(&ipgre_fb_tunnel_dev);
1223 #endif