/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
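
/*
 *	Pick the fragment identification for an outgoing packet: a single
 *	global counter, serialized by a spinlock.  Zero is skipped so that
 *	an ID of 0 can serve as "not yet assigned" (see the slow path in
 *	ip6_fragment), and the value is stored in network byte order.
 */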
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
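
/*
 *	Final output step: use the cached hardware header if the route has
 *	one, otherwise hand the packet to the neighbour entry.  With neither
 *	available the packet cannot be sent and is dropped.
 */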
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}
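
/*
 *	Device output path.  For multicast destinations a clone may first be
 *	looped back to local listeners via ip6_dev_loopback_xmit; the
 *	original then goes out through the NF_IP6_POST_ROUTING hook.
 */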
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
					&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
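
/*
 *	Entry point from dst_output(): fragment when the packet exceeds the
 *	route MTU (unless UFO will segment it in hardware) or when the route
 *	requires fragmentation on every packet (dst_allfrag).
 */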
int ip6_output(struct sk_buff *skb)
{
	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
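
/*
 *	Deliver a copy of a Router Alert packet to every raw socket
 *	registered for this alert value on ip6_ra_chain; all but the last
 *	matching socket get a clone.  Returns 1 if at least one socket
 *	consumed the skb, 0 otherwise.
 */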
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
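
/*
 *	Forwarding path: validate hop limit, xfrm policy and MTU, emit
 *	redirects where appropriate, then pass the packet through the
 *	NF_IP6_FORWARD hook towards the output route.
 */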
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
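
/*
 *	Copy the per-packet metadata (priority, routing, traffic control
 *	and netfilter state) from the original skb to a fragment.
 */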
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	nf_conntrack_put_reasm(to->nfct_reasm);
	to->nfct_reasm = from->nfct_reasm;
	nf_conntrack_get_reasm(to->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}
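
/*
 *	Walk the extension header chain and return the length of the
 *	unfragmentable part, i.e. the offset at which a fragment header
 *	must be inserted.  *nexthdr is left pointing at the nexthdr byte
 *	that will be overwritten with NEXTHDR_FRAGMENT.
 */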
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}
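
/*
 *	Fragment an oversized packet.  The fast path reuses an existing
 *	frag_list when every chunk already has the right geometry; the
 *	slow path allocates a fresh skb per fragment and copies the data.
 */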
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr*)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}
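
/*
 *	Look up the destination route for a flow, reusing the socket's
 *	cached dst when it is still valid, and pick a source address if
 *	the caller left fl6_src unspecified.
 */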
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;

			/* Yes, checking route validity in not connected
			   case is not very simple. Take into account,
			   that we do not support routing by source, TOS,
			   and MSG_DONTROUTE		--ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now,
			      (because main consumer of this service
			       is tcp, which has not this problem),
			      so that the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}

EXPORT_SYMBOL_GPL(ip6_dst_lookup);
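
/*
 *	UDP fragmentation offload (UFO): build one large datagram and let
 *	the device split it, recording the fragment payload size and the
 *	fragment ID in the shared skb info.
 */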
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb->nh.raw = skb->data;

		/* initialize protocol header pointer */
		skb->h.raw = skb->data + fragheaderlen;

		skb->ip_summed = CHECKSUM_HW;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
					    sizeof(struct frag_hdr);
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}
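
/*
 *	Append data to the pending (corked) queue of the socket, building
 *	MTU-sized skbs that ip6_push_pending_frames() will later join and
 *	send.  The first call on an empty queue sets up the cork state
 *	from opt, fl and rt.
 */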
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					fragheaderlen, transhdrlen, mtu, flags))
			goto error;

		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
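
/*
 *	Join all pending skbs into one datagram, prepend extension headers
 *	and the IPv6 header, and send it through NF_IP6_LOCAL_OUT.  The
 *	cork state is released on both success and failure.
 */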
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

	*(u32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}
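
/*
 *	Drop everything still sitting on the pending queue and release
 *	the cork state.
 */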
void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}