/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.72 1999/09/07 02:31:15 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
/*
 *	Shall we try to damage output packets if routing dev changes?
 */
int sysctl_ip_dynaddr = 0;

int ip_id_count = 0;
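/* ip_id_count supplies the 16-bit Identification field stamped into every
 * outgoing datagram (see the htons(ip_id_count++) users below); fragments
 * of one datagram must all carry the same id so the receiver can
 * reassemble them. */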
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
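/* Note: the check field is cleared above before ip_fast_csum() runs,
 * because the one's-complement sum is taken over the whole header,
 * including the checksum field itself. */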
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_loopback_xmit(newskb);
#endif
	netif_rx(newskb);
	return 0;
}
#ifdef CONFIG_NETFILTER
/* To preserve the cute illusion that a locally-generated packet can
   be mangled before routing, we actually reroute if a hook altered
   the packet. -RR */
static int route_me_harder(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;
	struct rtable *rt;

	if (ip_route_output(&rt, iph->daddr, iph->saddr,
			    RT_TOS(iph->tos) | RTO_CONN,
			    skb->sk ? skb->sk->bound_dev_if : 0)) {
		printk("route_me_harder: No more route.\n");
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = &rt->u.dst;
	return 0;
}
#endif
/* Do route recalc if netfilter changes skb. */
static inline int
output_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
	if (skb->nfcache & NFC_ALTERED) {
		if (route_me_harder(skb) != 0) {
			kfree_skb(skb);
			return -EINVAL;
		}
	}
#endif
	return skb->dst->output(skb);
}
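/* NFC_ALTERED is set in skb->nfcache when a netfilter hook has modified
 * the packet, so the route that was looked up against the original
 * addresses may no longer be valid and must be recomputed by
 * route_me_harder() before the packet is handed to dst->output(). */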
/*
 *	Add an ip header to a skbuff and send it out.
 */
void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			   u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = sk->protinfo.af_inet.tos;
	iph->frag_off = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= htons(IP_DF);
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->protocol;
	iph->tot_len  = htons(skb->len);
	iph->id       = htons(ip_id_count++);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	/* Send it out. */
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, NULL,
		output_maybe_reroute);
}
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

	if (hh) {
		read_lock_bh(&hh->hh_lock);
		memcpy(skb->data - 16, hh->hh_data, 16);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	printk(KERN_DEBUG "khm\n");
	kfree_skb(skb);
	return -EINVAL;
}
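/* dst->hh is the cached, pre-built hard header (e.g. the Ethernet header)
 * for this destination: when it is present it is simply copied in front
 * of the IP header under hh_lock and the frame is handed straight to the
 * device via hh_output().  Otherwise the neighbour layer's output routine
 * has to resolve the link-layer address first, possibly queueing the
 * packet until resolution completes. */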
__inline__ int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = __constant_htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags & RTCF_NAT)
		ip_do_nat(skb);
#endif

	skb->dev = dev;
	skb->protocol = __constant_htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->protinfo.af_inet.mc_loop)) {
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
		{
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	return ip_finish_output(skb);
}
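/* Multicast and broadcast frames are cloned and fed back through
 * ip_dev_loopback_xmit() so that local listeners receive a copy; the
 * clone shares the data buffer, so this loopback is cheap.  A multicast
 * TTL of 0 means the datagram must never leave the host at all. */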
int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
	struct rtable *rt = (struct rtable*)skb->dst;
#endif

	ip_statistics.IpOutRequests++;

#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags&RTCF_NAT)
		ip_do_nat(skb);
#endif

	return ip_finish_output(skb);
}
/* Queues a packet to be sent, and starts the transmitter if necessary.
 * This routine also needs to put in the total length and compute the
 * checksum.  We used to do this in two stages, ip_build_header() then
 * this, but that scheme created a mess when routes disappeared etc.
 * So we do it all here, and the TCP send engine has been changed to
 * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 * most likely make other reliable transport layers above IP easier
 * to implement under Linux.
 */
static inline int ip_queue_xmit2(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev;
	struct iphdr *iph = skb->nh.iph;

#ifdef CONFIG_NETFILTER
	/* BLUE-PEN-FOR-ALEXEY.  I don't understand; you mean I can't
	   hold the route as I pass the packet to userspace? -- RR

	   You may hold it, if you really hold it.  F.e. if netfilter
	   does not destroy handed skb with skb->dst attached, it
	   will be held.  When it was stored in info->arg, then
	   it was not held apparently.  Now (without second arg) it is
	   evident that it is clean. --ANK
	 */
	if (rt == NULL || (skb->nfcache & NFC_ALTERED)) {
		if (route_me_harder(skb) != 0) {
			kfree_skb(skb);
			return -EHOSTUNREACH;
		}
	}
#endif

	dev = rt->u.dst.dev;

	/* This can happen when the transport layer has segments queued
	 * with a cached route, and by the time we get here things are
	 * re-routed to a device with a different MTU than the original
	 * device.  Sick, but we must cover it.
	 */
	if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
		kfree_skb(skb);
		if (skb2 == NULL)
			return -ENOMEM;
		if (sk)
			skb_set_owner_w(skb2, sk);
		skb = skb2;
		iph = skb->nh.iph;
	}

	if (skb->len > rt->u.dst.pmtu)
		goto fragment;

	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= __constant_htons(IP_DF);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->priority;
	return skb->dst->output(skb);

fragment:
	if (ip_dont_fragment(sk, &rt->u.dst)) {
		/* Reject packet ONLY if TCP might fragment
		 * it itself, if we were careful enough.
		 */
		iph->frag_off |= __constant_htons(IP_DF);
		NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));

		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(rt->u.dst.pmtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	return ip_fragment(skb, skb->dst->output);
}
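/* When the packet exceeds the path MTU and the socket asked for path MTU
 * discovery (DF set), no fragmentation is done: an ICMP FRAG_NEEDED error
 * is generated locally ("to self") so the transport layer can shrink its
 * segment size; otherwise the datagram is split by ip_fragment(). */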
int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct ip_options *opt = sk->protinfo.af_inet.opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = sk->daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		/* If this fails, the retransmit mechanism of the transport
		 * layer will keep trying until the route appears or the
		 * connection times out.
		 */
		if (ip_route_output(&rt, daddr, sk->saddr,
				    RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
				    sk->bound_dev_if))
			goto no_route;
		__sk_dst_set(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = sk->protinfo.af_inet.tos;
	iph->frag_off = 0;
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->protocol;
	skb->nh.iph   = iph;
	/* Transport layer set skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, sk->daddr, rt, 0);
	}

	iph->tot_len = htons(skb->len);
	iph->id = htons(ip_id_count++);

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       ip_queue_xmit2);

no_route:
	ip_statistics.IpOutNoRoutes++;
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
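/* ip_queue_xmit() is the transmit entry point for connected sockets (TCP
 * goes through it for every segment): it revalidates the route cached on
 * the socket, rebuilds the route if it has been invalidated, prepends the
 * IP header, and leaves DF/fragmentation handling to ip_queue_xmit2()
 * after the LOCAL_OUT netfilter hook. */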
/*
 *	Build and send a packet, with as little as one copy.
 *
 *	Doesn't care much about ip options... option length can be
 *	different for fragment at 0 and other fragments.
 *
 *	Note that the fragment at the highest offset is sent first,
 *	so the getfrag routine can fill in the TCP/UDP checksum header
 *	field in the last fragment it sends... it also helps the
 *	reassemblers: they can put most packets in at the head of the
 *	fragment queue, and they know the total size in advance.  This
 *	last feature will measurably improve the Linux fragment handler
 *	one day.
 *
 *	The getfrag callback takes four args: an arbitrary pointer (a copy
 *	of frag), the destination to copy to within the packet (char *),
 *	the offset to copy from, and the length to be copied.
 */
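/*
 * A minimal sketch of such a getfrag callback (hypothetical, for
 * illustration only -- not part of this file): it just copies bytes from
 * a kernel buffer passed via 'frag' into the packet at the given offset.
 *
 *	static int example_getfrag(const void *p, char *to,
 *				   unsigned int offset, unsigned int len)
 *	{
 *		memcpy(to, (const char *)p + offset, len);
 *		return 0;
 *	}
 *
 * Real users (the UDP/raw sendmsg paths and ip_reply_glue_bits below)
 * additionally copy from user space and/or accumulate a checksum while
 * copying, and return non-zero on fault.
 */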
static int ip_build_xmit_slow(struct sock *sk,
			      int getfrag (const void *,
					   char *,
					   unsigned int,
					   unsigned int),
			      const void *frag,
			      unsigned length,
			      struct ipcm_cookie *ipc,
			      struct rtable *rt,
			      int flags)
{
	unsigned int fraglen, maxfraglen, fragheaderlen;
	int err;
	int offset, mf;
	int mtu;
	unsigned short id;

	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
	int nfrags = 0;
	struct ip_options *opt = ipc->opt;
	int df = 0;

	mtu = rt->u.dst.pmtu;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	length -= sizeof(struct iphdr);

	if (opt) {
		fragheaderlen = sizeof(struct iphdr) + opt->optlen;
		maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
	} else {
		fragheaderlen = sizeof(struct iphdr);

		/*
		 *	Fragheaderlen is the size of 'overhead' on each buffer. Now work
		 *	out the size of the frames to send.
		 */

		maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
	}

	if (length + fragheaderlen > 0xFFFF) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}

	/*
	 *	Start at the end of the frame by handling the remainder.
	 */

	offset = length - (length % (maxfraglen - fragheaderlen));

	/*
	 *	Amount of memory to allocate for final fragment.
	 */

	fraglen = length - offset + fragheaderlen;

	if (length-offset==0) {
		fraglen = maxfraglen;
		offset -= maxfraglen-fragheaderlen;
	}

	/*
	 *	The last fragment will not have MF (more fragments) set.
	 */

	mf = 0;

	/*
	 *	Don't fragment packets for path mtu discovery.
	 */

	if (offset > 0 && df) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}
	if (flags&MSG_PROBE)
		goto out;

	/*
	 *	Get an identifier
	 */

	id = htons(ip_id_count++);

	/*
	 *	Begin outputting the bytes.
	 */

	do {
		char *data;
		struct sk_buff * skb;

		/*
		 *	Get the memory we require with some space left for alignment.
		 */

		skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
		if (skb == NULL)
			goto error;

		/*
		 *	Fill in the control structures
		 */

		skb->priority = sk->priority;
		skb->dst = dst_clone(&rt->u.dst);
		skb_reserve(skb, hh_len);

		/*
		 *	Find where to start putting bytes.
		 */

		data = skb_put(skb, fraglen);
		skb->nh.iph = (struct iphdr *)data;

		/*
		 *	Only write IP header onto non-raw packets
		 */

		{
			struct iphdr *iph = (struct iphdr *)data;

			iph->version = 4;
			iph->ihl = 5;
			if (opt) {
				iph->ihl += opt->optlen>>2;
				ip_options_build(skb, opt,
						 ipc->addr, rt, offset);
			}
			iph->tos = sk->protinfo.af_inet.tos;
			iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
			iph->id = id;
			iph->frag_off = htons(offset>>3);
			iph->frag_off |= mf|df;
			if (rt->rt_type == RTN_MULTICAST)
				iph->ttl = sk->protinfo.af_inet.mc_ttl;
			else
				iph->ttl = sk->protinfo.af_inet.ttl;
			iph->protocol = sk->protocol;
			iph->check = 0;
			iph->saddr = rt->rt_src;
			iph->daddr = rt->rt_dst;
			iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
			data += iph->ihl*4;

			/*
			 *	Any further fragments will have MF set.
			 */

			mf = htons(IP_MF);
		}

		/*
		 *	User data callback
		 */

		if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
			err = -EFAULT;
			kfree_skb(skb);
			goto error;
		}

		offset -= (maxfraglen-fragheaderlen);
		fraglen = maxfraglen;

		nfrags++;

		err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
			      skb->dst->dev, output_maybe_reroute);
		if (err) {
			if (err > 0)
				err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
			if (err)
				goto error;
		}
	} while (offset >= 0);

	if (nfrags > 1)
		ip_statistics.IpFragCreates += nfrags;
out:
	return 0;

error:
	ip_statistics.IpOutDiscards++;
	if (nfrags > 1)
		ip_statistics.IpFragCreates += nfrags;
	return err;
}
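/* Each fragment gets its own skb from sock_alloc_send_skb(), so every
 * allocation is charged against the socket's send buffer and may block
 * (or fail with EAGAIN under MSG_DONTWAIT) just like any other send. */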
/*
 *	Fast path for unfragmented packets.
 */
int ip_build_xmit(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	int err;
	struct sk_buff *skb;
	int df;
	struct iphdr *iph;

	/*
	 *	Try the simple case first. This leaves fragmented frames, and by
	 *	choice RAW frames within 20 bytes of maximum size (rare) to the long path
	 */

	if (!sk->protinfo.af_inet.hdrincl) {
		length += sizeof(struct iphdr);

		/*
		 *	Check for slow path.
		 */
		if (length > rt->u.dst.pmtu || ipc->opt != NULL)
			return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
	} else {
		if (length > rt->u.dst.dev->mtu) {
			ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
			return -EMSGSIZE;
		}
	}
	if (flags&MSG_PROBE)
		goto out;

	/*
	 *	Do path mtu discovery if needed.
	 */
	df = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	/*
	 *	Fast path for unfragmented frames without options.
	 */
	{
	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;

	skb = sock_alloc_send_skb(sk, length+hh_len+15,
				  0, flags&MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto error;
	skb_reserve(skb, hh_len);
	}

	skb->priority = sk->priority;
	skb->dst = dst_clone(&rt->u.dst);

	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);

	if (!sk->protinfo.af_inet.hdrincl) {
		iph->version = 4;
		iph->ihl = 5;
		iph->tos = sk->protinfo.af_inet.tos;
		iph->tot_len = htons(length);
		iph->id = htons(ip_id_count++);
		iph->frag_off = df;
		iph->ttl = sk->protinfo.af_inet.mc_ttl;
		if (rt->rt_type != RTN_MULTICAST)
			iph->ttl = sk->protinfo.af_inet.ttl;
		iph->protocol = sk->protocol;
		iph->saddr = rt->rt_src;
		iph->daddr = rt->rt_dst;
		iph->check = 0;
		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
		err = getfrag(frag, ((char *)iph)+iph->ihl*4, 0, length-iph->ihl*4);
	}
	else
		err = getfrag(frag, (void *)iph, 0, length);

	if (err)
		goto error_fault;

	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		      output_maybe_reroute);
	if (err > 0)
		err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
	if (err)
		goto error;
out:
	return 0;

error_fault:
	err = -EFAULT;
	kfree_skb(skb);
error:
	ip_statistics.IpOutDiscards++;
	return err;
}
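/* For raw sockets with IP_HDRINCL the caller supplies the complete IP
 * header itself, so the fast path above only checks the length against
 * the device MTU and copies the caller's buffer verbatim via getfrag();
 * no header fields are filled in by the kernel. */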
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each consisting of an IP header plus a block of
 *	the original data) that will still fit in a single device frame, and
 *	queue each such frame for sending.
 *
 *	Yes this is inefficient, feel free to submit a quicker one.
 */
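/* The IP fragment offset field counts 8-byte blocks, which is why offsets
 * are shifted right by 3 before being stored in frag_off and why every
 * fragment length except the last is rounded down to a multiple of 8
 * (the "len &= ~7" below). */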
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	unsigned char *raw;
	unsigned char *ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	raw = skb->nh.raw;
	iph = (struct iphdr*)raw;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	left = ntohs(iph->tot_len) - hlen;	/* Space per frame */
	mtu = rt->u.dst.pmtu - hlen;		/* Size of data space */
	ptr = raw + hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15, GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		skb2->pkt_type = skb->pkt_type;
		skb2->priority = skb->priority;
		skb_reserve(skb2, (dev->hard_header_len+15)&~15);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		skb2->dst = dst_clone(skb->dst);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, raw, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		memcpy(skb2->h.raw, ptr, len);
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each piece.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		ip_statistics.IpFragCreates++;

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	ip_statistics.IpFragOKs++;
	return err;

fail:
	kfree_skb(skb);
	ip_statistics.IpFragFails++;
	return err;
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
			      unsigned int fraglen)
{
	struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
	u16 *pktp = (u16 *)to;
	struct iovec *iov;
	int len;
	int hdrflag = 1;

	iov = &dp->iov[0];
	if (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
		hdrflag = 0;
	}
	len = iov->iov_len - offset;
	if (fraglen > len) { /* overlapping. */
		dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
						     dp->csum);
		offset = 0;
		fraglen -= len;
		to += len;
		iov++;
	}

	dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
					     dp->csum);

	if (hdrflag && dp->csumoffset)
		*(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
	return 0;
}
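/* dp->csumoffset is expressed in 16-bit words from the start of the
 * transport header, so "pktp + dp->csumoffset" lands on the TCP/UDP
 * checksum field; csum_fold() collapses the accumulated partial sum into
 * the final 16-bit one's-complement value stored there. */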
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct {
		struct ip_options opt;
		char data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = &replyopts.opt;

	if (ipc.opt->srr)
		daddr = replyopts.opt.faddr;
	if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	sk->protinfo.af_inet.tos = skb->nh.iph->tos;
	sk->priority = skb->priority;
	sk->protocol = skb->nh.iph->protocol;
	ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
/*
 *	IP protocol layer initialiser
 */

static struct packet_type ip_packet_type =
{
	__constant_htons(ETH_P_IP),
	NULL,	/* All devices */
	ip_rcv,
	(void*)1,
	NULL,
};

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
static struct proc_dir_entry proc_net_igmp = {
	PROC_NET_IGMP, 4, "igmp",
	S_IFREG | S_IRUGO, 1, 0, 0,
	0, &proc_net_inode_operations,
	ip_mc_procinfo
};
#endif
#endif
/*
 *	IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
	dev_add_pack(&ip_packet_type);

	ip_rt_init();

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
	proc_net_register(&proc_net_igmp);
#endif
#endif
}