Linux 2.6.14.3
net/ipv6/ip6_output.c
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
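/*
 * Pick a fragment identification for an outgoing packet.  A single
 * global counter is shared by all flows; it wraps around skipping 0
 * and is protected by a spinlock since this can run concurrently in
 * softirq context on several CPUs.
 */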
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
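/*
 * Final output step: if the destination has a cached hardware header
 * (hh_cache), prepend it under the read lock and hand the skb to the
 * cached output method; otherwise fall back to the neighbour's output
 * function. With neither available the packet cannot be sent and is
 * dropped as OUTNOROUTES.
 */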
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}
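/*
 * Device output. For multicast destinations, loop a copy back to local
 * listeners through the POST_ROUTING hook and ip6_dev_loopback_xmit()
 * when multicast loopback is enabled, and discard packets whose hop
 * limit is already zero. Everything else proceeds through POST_ROUTING
 * to ip6_output_finish().
 */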
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
					&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
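/*
 * Entry point from dst_output(): fragment if the packet exceeds the
 * path MTU (or the route demands fragmentation of all packets),
 * otherwise transmit directly via ip6_output2().
 */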
int ip6_output(struct sk_buff *skb)
{
	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
/*
 *	xmit an sk_buff (used by TCP)
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
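/*
 * Deliver a forwarded packet carrying a Router Alert option to every
 * raw socket registered for this alert value on ip6_ra_chain; each
 * listener but the last gets a clone, the last gets the original.
 * Returns 1 if the packet was consumed by a listener, 0 if normal
 * forwarding should continue.
 */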
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
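/*
 * Forwarding path: enforce the forwarding sysctl and XFRM policy,
 * divert Router Alert packets to interested raw sockets, send a Time
 * Exceeded error when the hop limit runs out, generate a redirect when
 * a better first hop exists on the same link, bounce oversized packets
 * with Packet Too Big, and only then decrement the hop limit (after
 * skb_cow(), so a shared header is never modified) before re-queueing
 * the packet through the FORWARD netfilter hook.
 */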
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
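/*
 * Propagate per-packet metadata (packet type, priority, dst reference,
 * traffic-control index, netfilter state) from the original skb to a
 * freshly built fragment, so each fragment is routed and classified
 * exactly like the parent packet.
 */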
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}
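/*
 * Walk the extension header chain and return the offset at which a
 * Fragment header must be inserted: after any Hop-by-Hop, Routing and
 * routing-related Destination Options headers, i.e. after everything
 * that intermediate nodes must be able to process. *nexthdr is left
 * pointing at the nexthdr byte to be patched with NEXTHDR_FRAGMENT.
 */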
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}
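/*
 * Split an oversized packet into fragments and pass each one to
 * 'output'. Two strategies: a fast path that reuses an existing
 * frag_list (each list member already holds fragment-sized data and
 * enough headroom, so only headers need to be prepended), and a slow
 * path that allocates a new skb per fragment and copies the payload
 * into it. The unfragmentable part of the header chain, as found by
 * ip6_find_1stfragopt(), is replicated in front of every fragment.
 */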
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (tmp_hdr)
			kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr*)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}
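/*
 * Resolve the route for a flow. A socket's cached dst is reused when
 * it is still valid for this destination and outgoing interface;
 * otherwise a fresh route lookup is made. If no source address has
 * been chosen yet, pick one based on the destination. Returns 0 with
 * *dst held, or a negative error with *dst set to NULL.
 */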
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;

			/* Yes, checking route validity in not connected
			   case is not very simple. Take into account,
			   that we do not support routing by source, TOS,
			   and MSG_DONTROUTE		--ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now,
			      (because main consumer of this service
			      is tcp, which has not this problem),
			      so that the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
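/*
 * Append data to the per-socket queue of pending packets (corking).
 * On the first call the routing, options and MTU state is captured in
 * the cork; later calls keep filling the tail skb and start a new one
 * whenever the current packet would exceed the fragment size. Payload
 * is pulled in through the caller-supplied getfrag() callback, either
 * into linear skb space or, on scatter-gather devices, into shared
 * pages. The queue is finally sent by ip6_push_pending_frames().
 */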
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}
		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
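/*
 * Flush the cork: splice all queued skbs into one packet (tail skbs
 * become the frag_list of the first), push the extension headers and
 * the IPv6 header built from the corked flow state, then hand the
 * result to the LOCAL_OUT netfilter hook and dst_output(). Cork state
 * is reset on the way out.
 */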
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

	*(u32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}
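/*
 * Abort corking: discard every queued skb and reset the cork state,
 * counting each dropped packet as an output discard.
 */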
void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}