net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40
  41 #include <linux/netfilter.h>
  42 #include <linux/netfilter_ipv6.h>
  43
  44 #include <net/sock.h>
  45 #include <net/snmp.h>
  46
  47 #include <net/ipv6.h>
  48 #include <net/ndisc.h>
  49 #include <net/protocol.h>
  50 #include <net/ip6_route.h>
  51 #include <net/addrconf.h>
  52 #include <net/rawv6.h>
  53 #include <net/icmp.h>
  54 #include <net/xfrm.h>
  55 #include <net/checksum.h>
  56 #include <linux/mroute6.h>
  57
  58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  59
  60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  61 {
  62         static u32 ipv6_fragmentation_id = 1;
  63         static DEFINE_SPINLOCK(ip6_id_lock);
  64
  65         spin_lock_bh(&ip6_id_lock);
  66         fhdr->identification = htonl(ipv6_fragmentation_id);
  67         if (++ipv6_fragmentation_id == 0)
  68                 ipv6_fragmentation_id = 1;
  69         spin_unlock_bh(&ip6_id_lock);
  70 }
  71
  72 int __ip6_local_out(struct sk_buff *skb)
  73 {
  74         int len;
  75
  76         len = skb->len - sizeof(struct ipv6hdr);
  77         if (len > IPV6_MAXPLEN)
  78                 len = 0;
  79         ipv6_hdr(skb)->payload_len = htons(len);
  80
  81         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
  82                        dst_output);
  83 }
  84
  85 int ip6_local_out(struct sk_buff *skb)
  86 {
  87         int err;
  88
  89         err = __ip6_local_out(skb);
  90         if (likely(err == 1))
  91                 err = dst_output(skb);
  92
  93         return err;
  94 }
  95 EXPORT_SYMBOL_GPL(ip6_local_out);
  96
  97 static int ip6_output_finish(struct sk_buff *skb)
  98 {
  99         struct dst_entry *dst = skb->dst;
 100
 101         if (dst->hh)
 102                 return neigh_hh_output(dst->hh, skb);
 103         else if (dst->neighbour)
 104                 return dst->neighbour->output(skb);
 105
 106         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 107         kfree_skb(skb);
 108         return -EINVAL;
 109
 110 }
 111
 112 /* dev_loopback_xmit for use with netfilter. */
 113 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 114 {
 115         skb_reset_mac_header(newskb);
 116         __skb_pull(newskb, skb_network_offset(newskb));
 117         newskb->pkt_type = PACKET_LOOPBACK;
 118         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 119         BUG_TRAP(newskb->dst);
 120
 121         netif_rx(newskb);
 122         return 0;
 123 }
 124
 125
 126 static int ip6_output2(struct sk_buff *skb)
 127 {
 128         struct dst_entry *dst = skb->dst;
 129         struct net_device *dev = dst->dev;
 130
 131         skb->protocol = htons(ETH_P_IPV6);
 132         skb->dev = dev;
 133
 134         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 135                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 136                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 137
 138                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 139                     ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 140                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 141                                          &ipv6_hdr(skb)->saddr))) {
 142                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 143
 144                         /* Do not check for IFF_ALLMULTI; multicast routing
 145                            is not supported in any case.
 146                          */
 147                         if (newskb)
 148                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 149                                         NULL, newskb->dev,
 150                                         ip6_dev_loopback_xmit);
 151
 152                         if (ipv6_hdr(skb)->hop_limit == 0) {
 153                                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
 154                                 kfree_skb(skb);
 155                                 return 0;
 156                         }
 157                 }
 158
 159                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
 160         }
 161
 162         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 163                        ip6_output_finish);
 164 }
 165
 166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 167 {
 168         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 169
 170         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 171                skb->dst->dev->mtu : dst_mtu(skb->dst);
 172 }
 173
 174 int ip6_output(struct sk_buff *skb)
 175 {
 176         struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 177         if (unlikely(idev->cnf.disable_ipv6)) {
 178                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
 179                 kfree_skb(skb);
 180                 return 0;
 181         }
 182
 183         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 184                                 dst_allfrag(skb->dst))
 185                 return ip6_fragment(skb, ip6_output2);
 186         else
 187                 return ip6_output2(skb);
 188 }
 189
 190 /*
 191  *      xmit an sk_buff (used by TCP)
 192  */
 193
 194 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 195              struct ipv6_txoptions *opt, int ipfragok)
 196 {
 197         struct ipv6_pinfo *np = inet6_sk(sk);
 198         struct in6_addr *first_hop = &fl->fl6_dst;
 199         struct dst_entry *dst = skb->dst;
 200         struct ipv6hdr *hdr;
 201         u8  proto = fl->proto;
 202         int seg_len = skb->len;
 203         int hlimit, tclass;
 204         u32 mtu;
 205
 206         if (opt) {
 207                 unsigned int head_room;
 208
 209                 /* First: exthdrs may take lots of space (~8K for now)
 210                    MAX_HEADER is not enough.
 211                  */
 212                 head_room = opt->opt_nflen + opt->opt_flen;
 213                 seg_len += head_room;
 214                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 215
 216                 if (skb_headroom(skb) < head_room) {
 217                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 218                         if (skb2 == NULL) {
 219                                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 220                                               IPSTATS_MIB_OUTDISCARDS);
 221                                 kfree_skb(skb);
 222                                 return -ENOBUFS;
 223                         }
 224                         kfree_skb(skb);
 225                         skb = skb2;
 226                         if (sk)
 227                                 skb_set_owner_w(skb, sk);
 228                 }
 229                 if (opt->opt_flen)
 230                         ipv6_push_frag_opts(skb, opt, &proto);
 231                 if (opt->opt_nflen)
 232                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 233         }
 234
 235         skb_push(skb, sizeof(struct ipv6hdr));
 236         skb_reset_network_header(skb);
 237         hdr = ipv6_hdr(skb);
 238
 239         /*
 240          *      Fill in the IPv6 header
 241          */
 242
 243         hlimit = -1;
 244         if (np)
 245                 hlimit = np->hop_limit;
 246         if (hlimit < 0)
 247                 hlimit = ip6_dst_hoplimit(dst);
 248
 249         tclass = -1;
 250         if (np)
 251                 tclass = np->tclass;
 252         if (tclass < 0)
 253                 tclass = 0;
 254
 255         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 256
 257         hdr->payload_len = htons(seg_len);
 258         hdr->nexthdr = proto;
 259         hdr->hop_limit = hlimit;
 260
 261         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 262         ipv6_addr_copy(&hdr->daddr, first_hop);
 263
 264         skb->priority = sk->sk_priority;
 265         skb->mark = sk->sk_mark;
 266
 267         mtu = dst_mtu(dst);
 268         if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
 269                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 270                               IPSTATS_MIB_OUTREQUESTS);
 271                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 272                                 dst_output);
 273         }
 274
 275         if (net_ratelimit())
 276                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 277         skb->dev = dst->dev;
 278         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 279         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 280         kfree_skb(skb);
 281         return -EMSGSIZE;
 282 }
 283
 284 EXPORT_SYMBOL(ip6_xmit);
 285
 286 /*
 287  *      To avoid extra problems ND packets are send through this
 288  *      routine. It's code duplication but I really want to avoid
 289  *      extra checks since ipv6_build_header is used by TCP (which
 290  *      is for us performance critical)
 291  */
 292
 293 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 294                const struct in6_addr *saddr, const struct in6_addr *daddr,
 295                int proto, int len)
 296 {
 297         struct ipv6_pinfo *np = inet6_sk(sk);
 298         struct ipv6hdr *hdr;
 299         int totlen;
 300
 301         skb->protocol = htons(ETH_P_IPV6);
 302         skb->dev = dev;
 303
 304         totlen = len + sizeof(struct ipv6hdr);
 305
 306         skb_reset_network_header(skb);
 307         skb_put(skb, sizeof(struct ipv6hdr));
 308         hdr = ipv6_hdr(skb);
 309
 310         *(__be32*)hdr = htonl(0x60000000);
 311
 312         hdr->payload_len = htons(len);
 313         hdr->nexthdr = proto;
 314         hdr->hop_limit = np->hop_limit;
 315
 316         ipv6_addr_copy(&hdr->saddr, saddr);
 317         ipv6_addr_copy(&hdr->daddr, daddr);
 318
 319         return 0;
 320 }
 321
 322 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 323 {
 324         struct ip6_ra_chain *ra;
 325         struct sock *last = NULL;
 326
 327         read_lock(&ip6_ra_lock);
 328         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 329                 struct sock *sk = ra->sk;
 330                 if (sk && ra->sel == sel &&
 331                     (!sk->sk_bound_dev_if ||
 332                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 333                         if (last) {
 334                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 335                                 if (skb2)
 336                                         rawv6_rcv(last, skb2);
 337                         }
 338                         last = sk;
 339                 }
 340         }
 341
 342         if (last) {
 343                 rawv6_rcv(last, skb);
 344                 read_unlock(&ip6_ra_lock);
 345                 return 1;
 346         }
 347         read_unlock(&ip6_ra_lock);
 348         return 0;
 349 }
 350
 351 static int ip6_forward_proxy_check(struct sk_buff *skb)
 352 {
 353         struct ipv6hdr *hdr = ipv6_hdr(skb);
 354         u8 nexthdr = hdr->nexthdr;
 355         int offset;
 356
 357         if (ipv6_ext_hdr(nexthdr)) {
 358                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 359                 if (offset < 0)
 360                         return 0;
 361         } else
 362                 offset = sizeof(struct ipv6hdr);
 363
 364         if (nexthdr == IPPROTO_ICMPV6) {
 365                 struct icmp6hdr *icmp6;
 366
 367                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 368                                          offset + 1 - skb->data)))
 369                         return 0;
 370
 371                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 372
 373                 switch (icmp6->icmp6_type) {
 374                 case NDISC_ROUTER_SOLICITATION:
 375                 case NDISC_ROUTER_ADVERTISEMENT:
 376                 case NDISC_NEIGHBOUR_SOLICITATION:
 377                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 378                 case NDISC_REDIRECT:
 379                         /* For reaction involving unicast neighbor discovery
 380                          * message destined to the proxied address, pass it to
 381                          * input function.
 382                          */
 383                         return 1;
 384                 default:
 385                         break;
 386                 }
 387         }
 388
 389         /*
 390          * The proxying router can't forward traffic sent to a link-local
 391          * address, so signal the sender and discard the packet. This
 392          * behavior is clarified by the MIPv6 specification.
 393          */
 394         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 395                 dst_link_failure(skb);
 396                 return -1;
 397         }
 398
 399         return 0;
 400 }
 401
 402 static inline int ip6_forward_finish(struct sk_buff *skb)
 403 {
 404         return dst_output(skb);
 405 }
 406
 407 int ip6_forward(struct sk_buff *skb)
 408 {
 409         struct dst_entry *dst = skb->dst;
 410         struct ipv6hdr *hdr = ipv6_hdr(skb);
 411         struct inet6_skb_parm *opt = IP6CB(skb);
 412         struct net *net = dev_net(dst->dev);
 413
 414         if (net->ipv6.devconf_all->forwarding == 0)
 415                 goto error;
 416
 417         if (skb_warn_if_lro(skb))
 418                 goto drop;
 419
 420         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 421                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 422                 goto drop;
 423         }
 424
 425         skb_forward_csum(skb);
 426
 427         /*
 428          *      We DO NOT make any processing on
 429          *      RA packets, pushing them to user level AS IS
 430          *      without ane WARRANTY that application will be able
 431          *      to interpret them. The reason is that we
 432          *      cannot make anything clever here.
 433          *
 434          *      We are not end-node, so that if packet contains
 435          *      AH/ESP, we cannot make anything.
 436          *      Defragmentation also would be mistake, RA packets
 437          *      cannot be fragmented, because there is no warranty
 438          *      that different fragments will go along one path. --ANK
 439          */
 440         if (opt->ra) {
 441                 u8 *ptr = skb_network_header(skb) + opt->ra;
 442                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 443                         return 0;
 444         }
 445
 446         /*
 447          *      check and decrement ttl
 448          */
 449         if (hdr->hop_limit <= 1) {
 450                 /* Force OUTPUT device used as source address */
 451                 skb->dev = dst->dev;
 452                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 453                             0, skb->dev);
 454                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 455
 456                 kfree_skb(skb);
 457                 return -ETIMEDOUT;
 458         }
 459
 460         /* XXX: idev->cnf.proxy_ndp? */
 461         if (net->ipv6.devconf_all->proxy_ndp &&
 462             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 463                 int proxied = ip6_forward_proxy_check(skb);
 464                 if (proxied > 0)
 465                         return ip6_input(skb);
 466                 else if (proxied < 0) {
 467                         IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 468                         goto drop;
 469                 }
 470         }
 471
 472         if (!xfrm6_route_forward(skb)) {
 473                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 474                 goto drop;
 475         }
 476         dst = skb->dst;
 477
 478         /* IPv6 specs say nothing about it, but it is clear that we cannot
 479            send redirects to source routed frames.
 480            We don't send redirects to frames decapsulated from IPsec.
 481          */
 482         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 483             !skb->sp) {
 484                 struct in6_addr *target = NULL;
 485                 struct rt6_info *rt;
 486                 struct neighbour *n = dst->neighbour;
 487
 488                 /*
 489                  *      incoming and outgoing devices are the same
 490                  *      send a redirect.
 491                  */
 492
 493                 rt = (struct rt6_info *) dst;
 494                 if ((rt->rt6i_flags & RTF_GATEWAY))
 495                         target = (struct in6_addr*)&n->primary_key;
 496                 else
 497                         target = &hdr->daddr;
 498
 499                 /* Limit redirects both by destination (here)
 500                    and by source (inside ndisc_send_redirect)
 501                  */
 502                 if (xrlim_allow(dst, 1*HZ))
 503                         ndisc_send_redirect(skb, n, target);
 504         } else {
 505                 int addrtype = ipv6_addr_type(&hdr->saddr);
 506
 507                 /* This check is security critical. */
 508                 if (addrtype == IPV6_ADDR_ANY ||
 509                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 510                         goto error;
 511                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 512                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 513                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 514                         goto error;
 515                 }
 516         }
 517
 518         if (skb->len > dst_mtu(dst)) {
 519                 /* Again, force OUTPUT device used as source address */
 520                 skb->dev = dst->dev;
 521                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 522                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 523                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 524                 kfree_skb(skb);
 525                 return -EMSGSIZE;
 526         }
 527
 528         if (skb_cow(skb, dst->dev->hard_header_len)) {
 529                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 530                 goto drop;
 531         }
 532
 533         hdr = ipv6_hdr(skb);
 534
 535         /* Mangling hops number delayed to point after skb COW */
 536
 537         hdr->hop_limit--;
 538
 539         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 540         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 541                        ip6_forward_finish);
 542
 543 error:
 544         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 545 drop:
 546         kfree_skb(skb);
 547         return -EINVAL;
 548 }
 549
 550 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 551 {
 552         to->pkt_type = from->pkt_type;
 553         to->priority = from->priority;
 554         to->protocol = from->protocol;
 555         dst_release(to->dst);
 556         to->dst = dst_clone(from->dst);
 557         to->dev = from->dev;
 558         to->mark = from->mark;
 559
 560 #ifdef CONFIG_NET_SCHED
 561         to->tc_index = from->tc_index;
 562 #endif
 563         nf_copy(to, from);
 564 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 565     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 566         to->nf_trace = from->nf_trace;
 567 #endif
 568         skb_copy_secmark(to, from);
 569 }
 570
 571 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 572 {
 573         u16 offset = sizeof(struct ipv6hdr);
 574         struct ipv6_opt_hdr *exthdr =
 575                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 576         unsigned int packet_len = skb->tail - skb->network_header;
 577         int found_rhdr = 0;
 578         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 579
 580         while (offset + 1 <= packet_len) {
 581
 582                 switch (**nexthdr) {
 583
 584                 case NEXTHDR_HOP:
 585                         break;
 586                 case NEXTHDR_ROUTING:
 587                         found_rhdr = 1;
 588                         break;
 589                 case NEXTHDR_DEST:
 590 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 591                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 592                                 break;
 593 #endif
 594                         if (found_rhdr)
 595                                 return offset;
 596                         break;
 597                 default :
 598                         return offset;
 599                 }
 600
 601                 offset += ipv6_optlen(exthdr);
 602                 *nexthdr = &exthdr->nexthdr;
 603                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 604                                                  offset);
 605         }
 606
 607         return offset;
 608 }
 609
 610 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 611 {
 612         struct net_device *dev;
 613         struct sk_buff *frag;
 614         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 615         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 616         struct ipv6hdr *tmp_hdr;
 617         struct frag_hdr *fh;
 618         unsigned int mtu, hlen, left, len;
 619         __be32 frag_id = 0;
 620         int ptr, offset = 0, err=0;
 621         u8 *prevhdr, nexthdr = 0;
 622
 623         dev = rt->u.dst.dev;
 624         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 625         nexthdr = *prevhdr;
 626
 627         mtu = ip6_skb_dst_mtu(skb);
 628
 629         /* We must not fragment if the socket is set to force MTU discovery
 630          * or if the skb it not generated by a local socket.  (This last
 631          * check should be redundant, but it's free.)
 632          */
 633         if (!skb->local_df) {
 634                 skb->dev = skb->dst->dev;
 635                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 636                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 637                 kfree_skb(skb);
 638                 return -EMSGSIZE;
 639         }
 640
 641         if (np && np->frag_size < mtu) {
 642                 if (np->frag_size)
 643                         mtu = np->frag_size;
 644         }
 645         mtu -= hlen + sizeof(struct frag_hdr);
 646
 647         if (skb_shinfo(skb)->frag_list) {
 648                 int first_len = skb_pagelen(skb);
 649                 int truesizes = 0;
 650
 651                 if (first_len - hlen > mtu ||
 652                     ((first_len - hlen) & 7) ||
 653                     skb_cloned(skb))
 654                         goto slow_path;
 655
 656                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 657                         /* Correct geometry. */
 658                         if (frag->len > mtu ||
 659                             ((frag->len & 7) && frag->next) ||
 660                             skb_headroom(frag) < hlen)
 661                             goto slow_path;
 662
 663                         /* Partially cloned skb? */
 664                         if (skb_shared(frag))
 665                                 goto slow_path;
 666
 667                         BUG_ON(frag->sk);
 668                         if (skb->sk) {
 669                                 sock_hold(skb->sk);
 670                                 frag->sk = skb->sk;
 671                                 frag->destructor = sock_wfree;
 672                                 truesizes += frag->truesize;
 673                         }
 674                 }
 675
 676                 err = 0;
 677                 offset = 0;
 678                 frag = skb_shinfo(skb)->frag_list;
 679                 skb_shinfo(skb)->frag_list = NULL;
 680                 /* BUILD HEADER */
 681
 682                 *prevhdr = NEXTHDR_FRAGMENT;
 683                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 684                 if (!tmp_hdr) {
 685                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 686                         return -ENOMEM;
 687                 }
 688
 689                 __skb_pull(skb, hlen);
 690                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 691                 __skb_push(skb, hlen);
 692                 skb_reset_network_header(skb);
 693                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 694
 695                 ipv6_select_ident(skb, fh);
 696                 fh->nexthdr = nexthdr;
 697                 fh->reserved = 0;
 698                 fh->frag_off = htons(IP6_MF);
 699                 frag_id = fh->identification;
 700
 701                 first_len = skb_pagelen(skb);
 702                 skb->data_len = first_len - skb_headlen(skb);
 703                 skb->truesize -= truesizes;
 704                 skb->len = first_len;
 705                 ipv6_hdr(skb)->payload_len = htons(first_len -
 706                                                    sizeof(struct ipv6hdr));
 707
 708                 dst_hold(&rt->u.dst);
 709
 710                 for (;;) {
 711                         /* Prepare header of the next frame,
 712                          * before previous one went down. */
 713                         if (frag) {
 714                                 frag->ip_summed = CHECKSUM_NONE;
 715                                 skb_reset_transport_header(frag);
 716                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 717                                 __skb_push(frag, hlen);
 718                                 skb_reset_network_header(frag);
 719                                 memcpy(skb_network_header(frag), tmp_hdr,
 720                                        hlen);
 721                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 722                                 fh->nexthdr = nexthdr;
 723                                 fh->reserved = 0;
 724                                 fh->frag_off = htons(offset);
 725                                 if (frag->next != NULL)
 726                                         fh->frag_off |= htons(IP6_MF);
 727                                 fh->identification = frag_id;
 728                                 ipv6_hdr(frag)->payload_len =
 729                                                 htons(frag->len -
 730                                                       sizeof(struct ipv6hdr));
 731                                 ip6_copy_metadata(frag, skb);
 732                         }
 733
 734                         err = output(skb);
 735                         if(!err)
 736                                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
 737
 738                         if (err || !frag)
 739                                 break;
 740
 741                         skb = frag;
 742                         frag = skb->next;
 743                         skb->next = NULL;
 744                 }
 745
 746                 kfree(tmp_hdr);
 747
 748                 if (err == 0) {
 749                         IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
 750                         dst_release(&rt->u.dst);
 751                         return 0;
 752                 }
 753
 754                 while (frag) {
 755                         skb = frag->next;
 756                         kfree_skb(frag);
 757                         frag = skb;
 758                 }
 759
 760                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
 761                 dst_release(&rt->u.dst);
 762                 return err;
 763         }
 764
 765 slow_path:
 766         left = skb->len - hlen;         /* Space per frame */
 767         ptr = hlen;                     /* Where to start from */
 768
 769         /*
 770          *      Fragment the datagram.
 771          */
 772
 773         *prevhdr = NEXTHDR_FRAGMENT;
 774
 775         /*
 776          *      Keep copying data until we run out.
 777          */
 778         while(left > 0) {
 779                 len = left;
 780                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 781                 if (len > mtu)
 782                         len = mtu;
 783                 /* IF: we are not sending upto and including the packet end
 784                    then align the next start on an eight byte boundary */
 785                 if (len < left) {
 786                         len &= ~7;
 787                 }
 788                 /*
 789                  *      Allocate buffer.
 790                  */
 791
 792                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 793                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 794                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 795                                       IPSTATS_MIB_FRAGFAILS);
 796                         err = -ENOMEM;
 797                         goto fail;
 798                 }
 799
 800                 /*
 801                  *      Set up data on packet
 802                  */
 803
 804                 ip6_copy_metadata(frag, skb);
 805                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 806                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 807                 skb_reset_network_header(frag);
 808                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 809                 frag->transport_header = (frag->network_header + hlen +
 810                                           sizeof(struct frag_hdr));
 811
 812                 /*
 813                  *      Charge the memory for the fragment to any owner
 814                  *      it might possess
 815                  */
 816                 if (skb->sk)
 817                         skb_set_owner_w(frag, skb->sk);
 818
 819                 /*
 820                  *      Copy the packet header into the new buffer.
 821                  */
 822                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 823
 824                 /*
 825                  *      Build fragment header.
 826                  */
 827                 fh->nexthdr = nexthdr;
 828                 fh->reserved = 0;
 829                 if (!frag_id) {
 830                         ipv6_select_ident(skb, fh);
 831                         frag_id = fh->identification;
 832                 } else
 833                         fh->identification = frag_id;
 834
 835                 /*
 836                  *      Copy a block of the IP datagram.
 837                  */
 838                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 839                         BUG();
 840                 left -= len;
 841
 842                 fh->frag_off = htons(offset);
 843                 if (left > 0)
 844                         fh->frag_off |= htons(IP6_MF);
 845                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 846                                                     sizeof(struct ipv6hdr));
 847
 848                 ptr += len;
 849                 offset += len;
 850
 851                 /*
 852                  *      Put this fragment into the sending queue.
 853                  */
 854                 err = output(frag);
 855                 if (err)
 856                         goto fail;
 857
 858                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
 859         }
 860         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 861                       IPSTATS_MIB_FRAGOKS);
 862         kfree_skb(skb);
 863         return err;
 864
 865 fail:
 866         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 867                       IPSTATS_MIB_FRAGFAILS);
 868         kfree_skb(skb);
 869         return err;
 870 }
 871
 872 static inline int ip6_rt_check(struct rt6key *rt_key,
 873                                struct in6_addr *fl_addr,
 874                                struct in6_addr *addr_cache)
 875 {
 876         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 877                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 878 }
 879
 880 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 881                                           struct dst_entry *dst,
 882                                           struct flowi *fl)
 883 {
 884         struct ipv6_pinfo *np = inet6_sk(sk);
 885         struct rt6_info *rt = (struct rt6_info *)dst;
 886
 887         if (!dst)
 888                 goto out;
 889
 890         /* Yes, checking route validity in not connected
 891          * case is not very simple. Take into account,
 892          * that we do not support routing by source, TOS,
 893          * and MSG_DONTROUTE            --ANK (980726)
 894          *
 895          * 1. ip6_rt_check(): If route was host route,
 896          *    check that cached destination is current.
 897          *    If it is network route, we still may
 898          *    check its validity using saved pointer
 899          *    to the last used address: daddr_cache.
 900          *    We do not want to save whole address now,
 901          *    (because main consumer of this service
 902          *    is tcp, which has not this problem),
 903          *    so that the last trick works only on connected
 904          *    sockets.
 905          * 2. oif also should be the same.
 906          */
 907         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 908 #ifdef CONFIG_IPV6_SUBTREES
 909             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 910 #endif
 911             (fl->oif && fl->oif != dst->dev->ifindex)) {
 912                 dst_release(dst);
 913                 dst = NULL;
 914         }
 915
 916 out:
 917         return dst;
 918 }
 919
 920 static int ip6_dst_lookup_tail(struct sock *sk,
 921                                struct dst_entry **dst, struct flowi *fl)
 922 {
 923         int err;
 924         struct net *net = sock_net(sk);
 925
 926         if (*dst == NULL)
 927                 *dst = ip6_route_output(net, sk, fl);
 928
 929         if ((err = (*dst)->error))
 930                 goto out_err_release;
 931
 932         if (ipv6_addr_any(&fl->fl6_src)) {
 933                 err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
 934                                          &fl->fl6_dst,
 935                                          sk ? inet6_sk(sk)->srcprefs : 0,
 936                                          &fl->fl6_src);
 937                 if (err)
 938                         goto out_err_release;
 939         }
 940
 941 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 942                 /*
 943                  * Here if the dst entry we've looked up
 944                  * has a neighbour entry that is in the INCOMPLETE
 945                  * state and the src address from the flow is
 946                  * marked as OPTIMISTIC, we release the found
 947                  * dst entry and replace it instead with the
 948                  * dst entry of the nexthop router
 949                  */
 950                 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
 951                         struct inet6_ifaddr *ifp;
 952                         struct flowi fl_gw;
 953                         int redirect;
 954
 955                         ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
 956                                               (*dst)->dev, 1);
 957
 958                         redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 959                         if (ifp)
 960                                 in6_ifa_put(ifp);
 961
 962                         if (redirect) {
 963                                 /*
 964                                  * We need to get the dst entry for the
 965                                  * default router instead
 966                                  */
 967                                 dst_release(*dst);
 968                                 memcpy(&fl_gw, fl, sizeof(struct flowi));
 969                                 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 970                                 *dst = ip6_route_output(net, sk, &fl_gw);
 971                                 if ((err = (*dst)->error))
 972                                         goto out_err_release;
 973                         }
 974                 }
 975 #endif
 976
 977         return 0;
 978
 979 out_err_release:
 980         if (err == -ENETUNREACH)
 981                 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
 982         dst_release(*dst);
 983         *dst = NULL;
 984         return err;
 985 }
 986
 987 /**
 988  *      ip6_dst_lookup - perform route lookup on flow
 989  *      @sk: socket which provides route info
 990  *      @dst: pointer to dst_entry * for result
 991  *      @fl: flow to lookup
 992  *
 993  *      This function performs a route lookup on the given flow.
 994  *
 995  *      It returns zero on success, or a standard errno code on error.
 996  */
 997 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 998 {
 999         *dst = NULL;
1000         return ip6_dst_lookup_tail(sk, dst, fl);
1001 }
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003
1004 /**
1005  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1006  *      @sk: socket which provides the dst cache and route info
1007  *      @dst: pointer to dst_entry * for result
1008  *      @fl: flow to lookup
1009  *
1010  *      This function performs a route lookup on the given flow with the
1011  *      possibility of using the cached route in the socket if it is valid.
1012  *      It will take the socket dst lock when operating on the dst cache.
1013  *      As a result, this function can only be used in process context.
1014  *
1015  *      It returns zero on success, or a standard errno code on error.
1016  */
1017 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1018 {
1019         *dst = NULL;
1020         if (sk) {
1021                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1022                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1023         }
1024
1025         return ip6_dst_lookup_tail(sk, dst, fl);
1026 }
1027 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1028
1029 static inline int ip6_ufo_append_data(struct sock *sk,
1030                         int getfrag(void *from, char *to, int offset, int len,
1031                         int odd, struct sk_buff *skb),
1032                         void *from, int length, int hh_len, int fragheaderlen,
1033                         int transhdrlen, int mtu,unsigned int flags)
1034
1035 {
1036         struct sk_buff *skb;
1037         int err;
1038
1039         /* There is support for UDP large send offload by network
1040          * device, so create one single skb packet containing complete
1041          * udp datagram
1042          */
1043         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1044                 skb = sock_alloc_send_skb(sk,
1045                         hh_len + fragheaderlen + transhdrlen + 20,
1046                         (flags & MSG_DONTWAIT), &err);
1047                 if (skb == NULL)
1048                         return -ENOMEM;
1049
1050                 /* reserve space for Hardware header */
1051                 skb_reserve(skb, hh_len);
1052
1053                 /* create space for UDP/IP header */
1054                 skb_put(skb,fragheaderlen + transhdrlen);
1055
1056                 /* initialize network header pointer */
1057                 skb_reset_network_header(skb);
1058
1059                 /* initialize protocol header pointer */
1060                 skb->transport_header = skb->network_header + fragheaderlen;
1061
1062                 skb->ip_summed = CHECKSUM_PARTIAL;
1063                 skb->csum = 0;
1064                 sk->sk_sndmsg_off = 0;
1065         }
1066
1067         err = skb_append_datato_frags(sk,skb, getfrag, from,
1068                                       (length - transhdrlen));
1069         if (!err) {
1070                 struct frag_hdr fhdr;
1071
1072                 /* specify the length of each IP datagram fragment*/
1073                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1074                                             sizeof(struct frag_hdr);
1075                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1076                 ipv6_select_ident(skb, &fhdr);
1077                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1078                 __skb_queue_tail(&sk->sk_write_queue, skb);
1079
1080                 return 0;
1081         }
1082         /* There is not enough support do UPD LSO,
1083          * so follow normal path
1084          */
1085         kfree_skb(skb);
1086
1087         return err;
1088 }
1089
/**
 *      ip6_append_data - append payload to the socket's pending send queue
 *      @sk: socket whose sk_write_queue collects the pending skbs
 *      @getfrag: callback that copies payload from @from into a buffer;
 *                a negative return value is reported as -EFAULT
 *      @from: opaque cookie passed to @getfrag
 *      @length: number of bytes to append, including @transhdrlen
 *      @transhdrlen: transport header length; only honoured when the
 *                    queue is empty, otherwise treated as 0
 *      @hlimit: hop limit, saved in the cork state on the first call
 *      @tclass: traffic class, saved in the cork state on the first call
 *      @opt: tx options, copied into the cork state on the first call
 *      @fl: flow, copied into the cork state on the first call
 *      @rt: route, held and saved in the cork state on the first call
 *      @flags: MSG_* flags; MSG_PROBE, MSG_MORE and MSG_DONTWAIT are
 *              consulted
 *
 *      When the write queue is empty the cork state is initialised from
 *      the arguments.  On later calls @rt, @fl, @opt and the MTU are taken
 *      from the cork state instead and the corresponding arguments are
 *      ignored.
 *
 *      Returns 0 on success (or immediately for MSG_PROBE) and a negative
 *      errno otherwise.  Failures after the cork setup also increment
 *      IPSTATS_MIB_OUTDISCARDS; already queued skbs stay on the queue.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
        struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        /*
                         * Reuse an existing option buffer only if it is
                         * large enough for the new options.
                         */
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above miyazawa*/
                }
                /* The cork keeps its own reference on the route. */
                dst_hold(&rt->u.dst);
                inet->cork.dst = &rt->u.dst;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                /* With IPV6_PMTUDISC_PROBE use the device MTU, else the path MTU. */
                mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
                /* A non-zero frag_size smaller than the MTU overrides it. */
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                inet->cork.fragsize = mtu;
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                /*
                 * Extension header bytes are accounted as part of the
                 * transport header of the first skb.
                 */
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
                            rt->rt6i_nfheader_len;
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                /* Queue not empty: continue with the corked parameters. */
                rt = (struct rt6_info *)inet->cork.dst;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        /* IPv6 header plus everything that is repeated in each fragment. */
        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        /*
         * Largest skb length for a fragment: the part after
         * fragheaderlen is rounded down to a multiple of 8 and room is
         * left for the fragment header.
         */
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        /* Reject data that would not fit into a maximum sized IPv6 packet. */
        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        /* Accounted up front; the error path subtracts what was not consumed. */
        inet->cork.length += length;
        /* Oversized UDP send on a UFO capable device: build one big skb. */
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {

                err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
                                          fragheaderlen, transhdrlen, mtu,
                                          flags);
                if (err)
                        goto error;
                return 0;
        }

        /* Empty queue: jump straight into the allocation code below. */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
                        /* Also entered by goto with skb == NULL (empty queue). */
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        /*
                         * fraggap is the number of bytes by which the
                         * previous skb exceeds maxfraglen; they are moved
                         * into the new skb further down.
                         */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        /*
                         * Without scatter-gather, later MSG_MORE data must
                         * go into the linear area, so allocate a full MTU.
                         */
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        /*
                         * The first skb (the one carrying the transport
                         * header) is allocated with sock_alloc_send_skb();
                         * later ones use sock_wmalloc(), and only while
                         * sk_wmem_alloc is at most twice sk_sndbuf.
                         */
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                /*
                                 * Move the excess bytes of the previous skb
                                 * into this one, fix up both checksums and
                                 * trim the previous skb to maxfraglen.
                                 */
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        /* New payload bytes to fetch from the caller. */
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        /* Headers only go into the first skb. */
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                /* The current skb has room: never copy more than is left. */
                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        /* No scatter-gather: extend the linear data area. */
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                /* Undo the skb_put() on a failed copy. */
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        /* Scatter-gather: place the data in page fragments. */
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                /* The socket's current send page still has room. */
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        /* Page not yet attached to this skb. */
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if(i < MAX_SKB_FRAGS) {
                                /* Start a fresh page and remember it in the socket. */
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        /* Account the copied bytes in the frag, skb and socket. */
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        /* Take back the part of the length that was never queued. */
        inet->cork.length -= length;
        IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        return err;
}
1389
1390 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1391 {
1392         inet->cork.flags &= ~IPCORK_OPT;
1393         kfree(np->cork.opt);
1394         np->cork.opt = NULL;
1395         if (inet->cork.dst) {
1396                 dst_release(inet->cork.dst);
1397                 inet->cork.dst = NULL;
1398                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1399         }
1400         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1401 }
1402
1403 int ip6_push_pending_frames(struct sock *sk)
1404 {
1405         struct sk_buff *skb, *tmp_skb;
1406         struct sk_buff **tail_skb;
1407         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1408         struct inet_sock *inet = inet_sk(sk);
1409         struct ipv6_pinfo *np = inet6_sk(sk);
1410         struct ipv6hdr *hdr;
1411         struct ipv6_txoptions *opt = np->cork.opt;
1412         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1413         struct flowi *fl = &inet->cork.fl;
1414         unsigned char proto = fl->proto;
1415         int err = 0;
1416
1417         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1418                 goto out;
1419         tail_skb = &(skb_shinfo(skb)->frag_list);
1420
1421         /* move skb->data to ip header from ext header */
1422         if (skb->data < skb_network_header(skb))
1423                 __skb_pull(skb, skb_network_offset(skb));
1424         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1425                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1426                 *tail_skb = tmp_skb;
1427                 tail_skb = &(tmp_skb->next);
1428                 skb->len += tmp_skb->len;
1429                 skb->data_len += tmp_skb->len;
1430                 skb->truesize += tmp_skb->truesize;
1431                 __sock_put(tmp_skb->sk);
1432                 tmp_skb->destructor = NULL;
1433                 tmp_skb->sk = NULL;
1434         }
1435
1436         /* Allow local fragmentation. */
1437         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1438                 skb->local_df = 1;
1439
1440         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1441         __skb_pull(skb, skb_network_header_len(skb));
1442         if (opt && opt->opt_flen)
1443                 ipv6_push_frag_opts(skb, opt, &proto);
1444         if (opt && opt->opt_nflen)
1445                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1446
1447         skb_push(skb, sizeof(struct ipv6hdr));
1448         skb_reset_network_header(skb);
1449         hdr = ipv6_hdr(skb);
1450
1451         *(__be32*)hdr = fl->fl6_flowlabel |
1452                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1453
1454         hdr->hop_limit = np->cork.hop_limit;
1455         hdr->nexthdr = proto;
1456         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1457         ipv6_addr_copy(&hdr->daddr, final_dst);
1458
1459         skb->priority = sk->sk_priority;
1460         skb->mark = sk->sk_mark;
1461
1462         skb->dst = dst_clone(&rt->u.dst);
1463         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1464         if (proto == IPPROTO_ICMPV6) {
1465                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1466
1467                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1468                 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1469         }
1470
1471         err = ip6_local_out(skb);
1472         if (err) {
1473                 if (err > 0)
1474                         err = np->recverr ? net_xmit_errno(err) : 0;
1475                 if (err)
1476                         goto error;
1477         }
1478
1479 out:
1480         ip6_cork_release(inet, np);
1481         return err;
1482 error:
1483         goto out;
1484 }
1485
1486 void ip6_flush_pending_frames(struct sock *sk)
1487 {
1488         struct sk_buff *skb;
1489
1490         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1491                 if (skb->dst)
1492                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
1493                                       IPSTATS_MIB_OUTDISCARDS);
1494                 kfree_skb(skb);
1495         }
1496
1497         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1498 }