net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40
  41 #include <linux/netfilter.h>
  42 #include <linux/netfilter_ipv6.h>
  43
  44 #include <net/sock.h>
  45 #include <net/snmp.h>
  46
  47 #include <net/ipv6.h>
  48 #include <net/ndisc.h>
  49 #include <net/protocol.h>
  50 #include <net/ip6_route.h>
  51 #include <net/addrconf.h>
  52 #include <net/rawv6.h>
  53 #include <net/icmp.h>
  54 #include <net/xfrm.h>
  55 #include <net/checksum.h>
  56 #include <linux/mroute6.h>
  57
  58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  59
  60 int __ip6_local_out(struct sk_buff *skb)
  61 {
  62         int len;
  63
  64         len = skb->len - sizeof(struct ipv6hdr);
  65         if (len > IPV6_MAXPLEN)
  66                 len = 0;
  67         ipv6_hdr(skb)->payload_len = htons(len);
  68
  69         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
  70                        dst_output);
  71 }
  72
  73 int ip6_local_out(struct sk_buff *skb)
  74 {
  75         int err;
  76
  77         err = __ip6_local_out(skb);
  78         if (likely(err == 1))
  79                 err = dst_output(skb);
  80
  81         return err;
  82 }
  83 EXPORT_SYMBOL_GPL(ip6_local_out);
  84
  85 static int ip6_output_finish(struct sk_buff *skb)
  86 {
  87         struct dst_entry *dst = skb_dst(skb);
  88
  89         if (dst->hh)
  90                 return neigh_hh_output(dst->hh, skb);
  91         else if (dst->neighbour)
  92                 return dst->neighbour->output(skb);
  93
  94         IP6_INC_STATS_BH(dev_net(dst->dev),
  95                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
  96         kfree_skb(skb);
  97         return -EINVAL;
  98
  99 }
 100
 101 /* dev_loopback_xmit for use with netfilter. */
 102 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 103 {
 104         skb_reset_mac_header(newskb);
 105         __skb_pull(newskb, skb_network_offset(newskb));
 106         newskb->pkt_type = PACKET_LOOPBACK;
 107         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 108         WARN_ON(!skb_dst(newskb));
 109
 110         netif_rx(newskb);
 111         return 0;
 112 }
 113
 114
 115 static int ip6_output2(struct sk_buff *skb)
 116 {
 117         struct dst_entry *dst = skb_dst(skb);
 118         struct net_device *dev = dst->dev;
 119
 120         skb->protocol = htons(ETH_P_IPV6);
 121         skb->dev = dev;
 122
 123         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 124                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 125
 126                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
 127                     ((mroute6_socket(dev_net(dev)) &&
 128                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 129                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 130                                          &ipv6_hdr(skb)->saddr))) {
 131                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 132
 133                         /* Do not check for IFF_ALLMULTI; multicast routing
 134                            is not supported in any case.
 135                          */
 136                         if (newskb)
 137                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 138                                         NULL, newskb->dev,
 139                                         ip6_dev_loopback_xmit);
 140
 141                         if (ipv6_hdr(skb)->hop_limit == 0) {
 142                                 IP6_INC_STATS(dev_net(dev), idev,
 143                                               IPSTATS_MIB_OUTDISCARDS);
 144                                 kfree_skb(skb);
 145                                 return 0;
 146                         }
 147                 }
 148
 149                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 150                                 skb->len);
 151         }
 152
 153         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 154                        ip6_output_finish);
 155 }
 156
 157 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 158 {
 159         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 160
 161         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 162                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 163 }
 164
 165 int ip6_output(struct sk_buff *skb)
 166 {
 167         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 168         if (unlikely(idev->cnf.disable_ipv6)) {
 169                 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
 170                               IPSTATS_MIB_OUTDISCARDS);
 171                 kfree_skb(skb);
 172                 return 0;
 173         }
 174
 175         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 176                                 dst_allfrag(skb_dst(skb)))
 177                 return ip6_fragment(skb, ip6_output2);
 178         else
 179                 return ip6_output2(skb);
 180 }
 181
 182 /*
 183  *      xmit an sk_buff (used by TCP)
 184  */
 185
 186 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 187              struct ipv6_txoptions *opt, int ipfragok)
 188 {
 189         struct net *net = sock_net(sk);
 190         struct ipv6_pinfo *np = inet6_sk(sk);
 191         struct in6_addr *first_hop = &fl->fl6_dst;
 192         struct dst_entry *dst = skb_dst(skb);
 193         struct ipv6hdr *hdr;
 194         u8  proto = fl->proto;
 195         int seg_len = skb->len;
 196         int hlimit = -1;
 197         int tclass = 0;
 198         u32 mtu;
 199
 200         if (opt) {
 201                 unsigned int head_room;
 202
 203                 /* First: exthdrs may take lots of space (~8K for now)
 204                    MAX_HEADER is not enough.
 205                  */
 206                 head_room = opt->opt_nflen + opt->opt_flen;
 207                 seg_len += head_room;
 208                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 209
 210                 if (skb_headroom(skb) < head_room) {
 211                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 212                         if (skb2 == NULL) {
 213                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 214                                               IPSTATS_MIB_OUTDISCARDS);
 215                                 kfree_skb(skb);
 216                                 return -ENOBUFS;
 217                         }
 218                         kfree_skb(skb);
 219                         skb = skb2;
 220                         if (sk)
 221                                 skb_set_owner_w(skb, sk);
 222                 }
 223                 if (opt->opt_flen)
 224                         ipv6_push_frag_opts(skb, opt, &proto);
 225                 if (opt->opt_nflen)
 226                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 227         }
 228
 229         skb_push(skb, sizeof(struct ipv6hdr));
 230         skb_reset_network_header(skb);
 231         hdr = ipv6_hdr(skb);
 232
 233         /* Allow local fragmentation. */
 234         if (ipfragok)
 235                 skb->local_df = 1;
 236
 237         /*
 238          *      Fill in the IPv6 header
 239          */
 240         if (np) {
 241                 tclass = np->tclass;
 242                 hlimit = np->hop_limit;
 243         }
 244         if (hlimit < 0)
 245                 hlimit = ip6_dst_hoplimit(dst);
 246
 247         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 248
 249         hdr->payload_len = htons(seg_len);
 250         hdr->nexthdr = proto;
 251         hdr->hop_limit = hlimit;
 252
 253         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 254         ipv6_addr_copy(&hdr->daddr, first_hop);
 255
 256         skb->priority = sk->sk_priority;
 257         skb->mark = sk->sk_mark;
 258
 259         mtu = dst_mtu(dst);
 260         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 261                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 262                               IPSTATS_MIB_OUT, skb->len);
 263                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 264                                 dst_output);
 265         }
 266
 267         if (net_ratelimit())
 268                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 269         skb->dev = dst->dev;
 270         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 271         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 272         kfree_skb(skb);
 273         return -EMSGSIZE;
 274 }
 275
 276 EXPORT_SYMBOL(ip6_xmit);
 277
 278 /*
 279  *      To avoid extra problems ND packets are send through this
 280  *      routine. It's code duplication but I really want to avoid
 281  *      extra checks since ipv6_build_header is used by TCP (which
 282  *      is for us performance critical)
 283  */
 284
 285 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 286                const struct in6_addr *saddr, const struct in6_addr *daddr,
 287                int proto, int len)
 288 {
 289         struct ipv6_pinfo *np = inet6_sk(sk);
 290         struct ipv6hdr *hdr;
 291         int totlen;
 292
 293         skb->protocol = htons(ETH_P_IPV6);
 294         skb->dev = dev;
 295
 296         totlen = len + sizeof(struct ipv6hdr);
 297
 298         skb_reset_network_header(skb);
 299         skb_put(skb, sizeof(struct ipv6hdr));
 300         hdr = ipv6_hdr(skb);
 301
 302         *(__be32*)hdr = htonl(0x60000000);
 303
 304         hdr->payload_len = htons(len);
 305         hdr->nexthdr = proto;
 306         hdr->hop_limit = np->hop_limit;
 307
 308         ipv6_addr_copy(&hdr->saddr, saddr);
 309         ipv6_addr_copy(&hdr->daddr, daddr);
 310
 311         return 0;
 312 }
 313
 314 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 315 {
 316         struct ip6_ra_chain *ra;
 317         struct sock *last = NULL;
 318
 319         read_lock(&ip6_ra_lock);
 320         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 321                 struct sock *sk = ra->sk;
 322                 if (sk && ra->sel == sel &&
 323                     (!sk->sk_bound_dev_if ||
 324                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 325                         if (last) {
 326                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 327                                 if (skb2)
 328                                         rawv6_rcv(last, skb2);
 329                         }
 330                         last = sk;
 331                 }
 332         }
 333
 334         if (last) {
 335                 rawv6_rcv(last, skb);
 336                 read_unlock(&ip6_ra_lock);
 337                 return 1;
 338         }
 339         read_unlock(&ip6_ra_lock);
 340         return 0;
 341 }
 342
 343 static int ip6_forward_proxy_check(struct sk_buff *skb)
 344 {
 345         struct ipv6hdr *hdr = ipv6_hdr(skb);
 346         u8 nexthdr = hdr->nexthdr;
 347         int offset;
 348
 349         if (ipv6_ext_hdr(nexthdr)) {
 350                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 351                 if (offset < 0)
 352                         return 0;
 353         } else
 354                 offset = sizeof(struct ipv6hdr);
 355
 356         if (nexthdr == IPPROTO_ICMPV6) {
 357                 struct icmp6hdr *icmp6;
 358
 359                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 360                                          offset + 1 - skb->data)))
 361                         return 0;
 362
 363                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 364
 365                 switch (icmp6->icmp6_type) {
 366                 case NDISC_ROUTER_SOLICITATION:
 367                 case NDISC_ROUTER_ADVERTISEMENT:
 368                 case NDISC_NEIGHBOUR_SOLICITATION:
 369                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 370                 case NDISC_REDIRECT:
 371                         /* For reaction involving unicast neighbor discovery
 372                          * message destined to the proxied address, pass it to
 373                          * input function.
 374                          */
 375                         return 1;
 376                 default:
 377                         break;
 378                 }
 379         }
 380
 381         /*
 382          * The proxying router can't forward traffic sent to a link-local
 383          * address, so signal the sender and discard the packet. This
 384          * behavior is clarified by the MIPv6 specification.
 385          */
 386         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 387                 dst_link_failure(skb);
 388                 return -1;
 389         }
 390
 391         return 0;
 392 }
 393
 394 static inline int ip6_forward_finish(struct sk_buff *skb)
 395 {
 396         return dst_output(skb);
 397 }
 398
 399 int ip6_forward(struct sk_buff *skb)
 400 {
 401         struct dst_entry *dst = skb_dst(skb);
 402         struct ipv6hdr *hdr = ipv6_hdr(skb);
 403         struct inet6_skb_parm *opt = IP6CB(skb);
 404         struct net *net = dev_net(dst->dev);
 405
 406         if (net->ipv6.devconf_all->forwarding == 0)
 407                 goto error;
 408
 409         if (skb_warn_if_lro(skb))
 410                 goto drop;
 411
 412         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 413                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 414                 goto drop;
 415         }
 416
 417         skb_forward_csum(skb);
 418
 419         /*
 420          *      We DO NOT make any processing on
 421          *      RA packets, pushing them to user level AS IS
 422          *      without ane WARRANTY that application will be able
 423          *      to interpret them. The reason is that we
 424          *      cannot make anything clever here.
 425          *
 426          *      We are not end-node, so that if packet contains
 427          *      AH/ESP, we cannot make anything.
 428          *      Defragmentation also would be mistake, RA packets
 429          *      cannot be fragmented, because there is no warranty
 430          *      that different fragments will go along one path. --ANK
 431          */
 432         if (opt->ra) {
 433                 u8 *ptr = skb_network_header(skb) + opt->ra;
 434                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 435                         return 0;
 436         }
 437
 438         /*
 439          *      check and decrement ttl
 440          */
 441         if (hdr->hop_limit <= 1) {
 442                 /* Force OUTPUT device used as source address */
 443                 skb->dev = dst->dev;
 444                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 445                             0, skb->dev);
 446                 IP6_INC_STATS_BH(net,
 447                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 448
 449                 kfree_skb(skb);
 450                 return -ETIMEDOUT;
 451         }
 452
 453         /* XXX: idev->cnf.proxy_ndp? */
 454         if (net->ipv6.devconf_all->proxy_ndp &&
 455             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 456                 int proxied = ip6_forward_proxy_check(skb);
 457                 if (proxied > 0)
 458                         return ip6_input(skb);
 459                 else if (proxied < 0) {
 460                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 461                                       IPSTATS_MIB_INDISCARDS);
 462                         goto drop;
 463                 }
 464         }
 465
 466         if (!xfrm6_route_forward(skb)) {
 467                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 468                 goto drop;
 469         }
 470         dst = skb_dst(skb);
 471
 472         /* IPv6 specs say nothing about it, but it is clear that we cannot
 473            send redirects to source routed frames.
 474            We don't send redirects to frames decapsulated from IPsec.
 475          */
 476         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 477             !skb_sec_path(skb)) {
 478                 struct in6_addr *target = NULL;
 479                 struct rt6_info *rt;
 480                 struct neighbour *n = dst->neighbour;
 481
 482                 /*
 483                  *      incoming and outgoing devices are the same
 484                  *      send a redirect.
 485                  */
 486
 487                 rt = (struct rt6_info *) dst;
 488                 if ((rt->rt6i_flags & RTF_GATEWAY))
 489                         target = (struct in6_addr*)&n->primary_key;
 490                 else
 491                         target = &hdr->daddr;
 492
 493                 /* Limit redirects both by destination (here)
 494                    and by source (inside ndisc_send_redirect)
 495                  */
 496                 if (xrlim_allow(dst, 1*HZ))
 497                         ndisc_send_redirect(skb, n, target);
 498         } else {
 499                 int addrtype = ipv6_addr_type(&hdr->saddr);
 500
 501                 /* This check is security critical. */
 502                 if (addrtype == IPV6_ADDR_ANY ||
 503                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 504                         goto error;
 505                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 506                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 507                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 508                         goto error;
 509                 }
 510         }
 511
 512         if (skb->len > dst_mtu(dst)) {
 513                 /* Again, force OUTPUT device used as source address */
 514                 skb->dev = dst->dev;
 515                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 516                 IP6_INC_STATS_BH(net,
 517                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 518                 IP6_INC_STATS_BH(net,
 519                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 520                 kfree_skb(skb);
 521                 return -EMSGSIZE;
 522         }
 523
 524         if (skb_cow(skb, dst->dev->hard_header_len)) {
 525                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 526                 goto drop;
 527         }
 528
 529         hdr = ipv6_hdr(skb);
 530
 531         /* Mangling hops number delayed to point after skb COW */
 532
 533         hdr->hop_limit--;
 534
 535         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 536         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 537                        ip6_forward_finish);
 538
 539 error:
 540         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 541 drop:
 542         kfree_skb(skb);
 543         return -EINVAL;
 544 }
 545
 546 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 547 {
 548         to->pkt_type = from->pkt_type;
 549         to->priority = from->priority;
 550         to->protocol = from->protocol;
 551         skb_dst_drop(to);
 552         skb_dst_set(to, dst_clone(skb_dst(from)));
 553         to->dev = from->dev;
 554         to->mark = from->mark;
 555
 556 #ifdef CONFIG_NET_SCHED
 557         to->tc_index = from->tc_index;
 558 #endif
 559         nf_copy(to, from);
 560 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 561     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 562         to->nf_trace = from->nf_trace;
 563 #endif
 564         skb_copy_secmark(to, from);
 565 }
 566
 567 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 568 {
 569         u16 offset = sizeof(struct ipv6hdr);
 570         struct ipv6_opt_hdr *exthdr =
 571                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 572         unsigned int packet_len = skb->tail - skb->network_header;
 573         int found_rhdr = 0;
 574         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 575
 576         while (offset + 1 <= packet_len) {
 577
 578                 switch (**nexthdr) {
 579
 580                 case NEXTHDR_HOP:
 581                         break;
 582                 case NEXTHDR_ROUTING:
 583                         found_rhdr = 1;
 584                         break;
 585                 case NEXTHDR_DEST:
 586 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 587                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 588                                 break;
 589 #endif
 590                         if (found_rhdr)
 591                                 return offset;
 592                         break;
 593                 default :
 594                         return offset;
 595                 }
 596
 597                 offset += ipv6_optlen(exthdr);
 598                 *nexthdr = &exthdr->nexthdr;
 599                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 600                                                  offset);
 601         }
 602
 603         return offset;
 604 }
 605
 606 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 607 {
 608         struct sk_buff *frag;
 609         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 610         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 611         struct ipv6hdr *tmp_hdr;
 612         struct frag_hdr *fh;
 613         unsigned int mtu, hlen, left, len;
 614         __be32 frag_id = 0;
 615         int ptr, offset = 0, err=0;
 616         u8 *prevhdr, nexthdr = 0;
 617         struct net *net = dev_net(skb_dst(skb)->dev);
 618
 619         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 620         nexthdr = *prevhdr;
 621
 622         mtu = ip6_skb_dst_mtu(skb);
 623
 624         /* We must not fragment if the socket is set to force MTU discovery
 625          * or if the skb it not generated by a local socket.  (This last
 626          * check should be redundant, but it's free.)
 627          */
 628         if (!skb->local_df) {
 629                 skb->dev = skb_dst(skb)->dev;
 630                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 631                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 632                               IPSTATS_MIB_FRAGFAILS);
 633                 kfree_skb(skb);
 634                 return -EMSGSIZE;
 635         }
 636
 637         if (np && np->frag_size < mtu) {
 638                 if (np->frag_size)
 639                         mtu = np->frag_size;
 640         }
 641         mtu -= hlen + sizeof(struct frag_hdr);
 642
 643         if (skb_has_frags(skb)) {
 644                 int first_len = skb_pagelen(skb);
 645                 int truesizes = 0;
 646
 647                 if (first_len - hlen > mtu ||
 648                     ((first_len - hlen) & 7) ||
 649                     skb_cloned(skb))
 650                         goto slow_path;
 651
 652                 skb_walk_frags(skb, frag) {
 653                         /* Correct geometry. */
 654                         if (frag->len > mtu ||
 655                             ((frag->len & 7) && frag->next) ||
 656                             skb_headroom(frag) < hlen)
 657                             goto slow_path;
 658
 659                         /* Partially cloned skb? */
 660                         if (skb_shared(frag))
 661                                 goto slow_path;
 662
 663                         BUG_ON(frag->sk);
 664                         if (skb->sk) {
 665                                 frag->sk = skb->sk;
 666                                 frag->destructor = sock_wfree;
 667                                 truesizes += frag->truesize;
 668                         }
 669                 }
 670
 671                 err = 0;
 672                 offset = 0;
 673                 frag = skb_shinfo(skb)->frag_list;
 674                 skb_frag_list_init(skb);
 675                 /* BUILD HEADER */
 676
 677                 *prevhdr = NEXTHDR_FRAGMENT;
 678                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 679                 if (!tmp_hdr) {
 680                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 681                                       IPSTATS_MIB_FRAGFAILS);
 682                         return -ENOMEM;
 683                 }
 684
 685                 __skb_pull(skb, hlen);
 686                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 687                 __skb_push(skb, hlen);
 688                 skb_reset_network_header(skb);
 689                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 690
 691                 ipv6_select_ident(fh);
 692                 fh->nexthdr = nexthdr;
 693                 fh->reserved = 0;
 694                 fh->frag_off = htons(IP6_MF);
 695                 frag_id = fh->identification;
 696
 697                 first_len = skb_pagelen(skb);
 698                 skb->data_len = first_len - skb_headlen(skb);
 699                 skb->truesize -= truesizes;
 700                 skb->len = first_len;
 701                 ipv6_hdr(skb)->payload_len = htons(first_len -
 702                                                    sizeof(struct ipv6hdr));
 703
 704                 dst_hold(&rt->u.dst);
 705
 706                 for (;;) {
 707                         /* Prepare header of the next frame,
 708                          * before previous one went down. */
 709                         if (frag) {
 710                                 frag->ip_summed = CHECKSUM_NONE;
 711                                 skb_reset_transport_header(frag);
 712                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 713                                 __skb_push(frag, hlen);
 714                                 skb_reset_network_header(frag);
 715                                 memcpy(skb_network_header(frag), tmp_hdr,
 716                                        hlen);
 717                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 718                                 fh->nexthdr = nexthdr;
 719                                 fh->reserved = 0;
 720                                 fh->frag_off = htons(offset);
 721                                 if (frag->next != NULL)
 722                                         fh->frag_off |= htons(IP6_MF);
 723                                 fh->identification = frag_id;
 724                                 ipv6_hdr(frag)->payload_len =
 725                                                 htons(frag->len -
 726                                                       sizeof(struct ipv6hdr));
 727                                 ip6_copy_metadata(frag, skb);
 728                         }
 729
 730                         err = output(skb);
 731                         if(!err)
 732                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 733                                               IPSTATS_MIB_FRAGCREATES);
 734
 735                         if (err || !frag)
 736                                 break;
 737
 738                         skb = frag;
 739                         frag = skb->next;
 740                         skb->next = NULL;
 741                 }
 742
 743                 kfree(tmp_hdr);
 744
 745                 if (err == 0) {
 746                         IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 747                                       IPSTATS_MIB_FRAGOKS);
 748                         dst_release(&rt->u.dst);
 749                         return 0;
 750                 }
 751
 752                 while (frag) {
 753                         skb = frag->next;
 754                         kfree_skb(frag);
 755                         frag = skb;
 756                 }
 757
 758                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
 759                               IPSTATS_MIB_FRAGFAILS);
 760                 dst_release(&rt->u.dst);
 761                 return err;
 762         }
 763
 764 slow_path:
 765         left = skb->len - hlen;         /* Space per frame */
 766         ptr = hlen;                     /* Where to start from */
 767
 768         /*
 769          *      Fragment the datagram.
 770          */
 771
 772         *prevhdr = NEXTHDR_FRAGMENT;
 773
 774         /*
 775          *      Keep copying data until we run out.
 776          */
 777         while(left > 0) {
 778                 len = left;
 779                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 780                 if (len > mtu)
 781                         len = mtu;
 782                 /* IF: we are not sending upto and including the packet end
 783                    then align the next start on an eight byte boundary */
 784                 if (len < left) {
 785                         len &= ~7;
 786                 }
 787                 /*
 788                  *      Allocate buffer.
 789                  */
 790
 791                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 792                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 793                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 794                                       IPSTATS_MIB_FRAGFAILS);
 795                         err = -ENOMEM;
 796                         goto fail;
 797                 }
 798
 799                 /*
 800                  *      Set up data on packet
 801                  */
 802
 803                 ip6_copy_metadata(frag, skb);
 804                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 805                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 806                 skb_reset_network_header(frag);
 807                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 808                 frag->transport_header = (frag->network_header + hlen +
 809                                           sizeof(struct frag_hdr));
 810
 811                 /*
 812                  *      Charge the memory for the fragment to any owner
 813                  *      it might possess
 814                  */
 815                 if (skb->sk)
 816                         skb_set_owner_w(frag, skb->sk);
 817
 818                 /*
 819                  *      Copy the packet header into the new buffer.
 820                  */
 821                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 822
 823                 /*
 824                  *      Build fragment header.
 825                  */
 826                 fh->nexthdr = nexthdr;
 827                 fh->reserved = 0;
 828                 if (!frag_id) {
 829                         ipv6_select_ident(fh);
 830                         frag_id = fh->identification;
 831                 } else
 832                         fh->identification = frag_id;
 833
 834                 /*
 835                  *      Copy a block of the IP datagram.
 836                  */
 837                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 838                         BUG();
 839                 left -= len;
 840
 841                 fh->frag_off = htons(offset);
 842                 if (left > 0)
 843                         fh->frag_off |= htons(IP6_MF);
 844                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 845                                                     sizeof(struct ipv6hdr));
 846
 847                 ptr += len;
 848                 offset += len;
 849
 850                 /*
 851                  *      Put this fragment into the sending queue.
 852                  */
 853                 err = output(frag);
 854                 if (err)
 855                         goto fail;
 856
 857                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 858                               IPSTATS_MIB_FRAGCREATES);
 859         }
 860         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 861                       IPSTATS_MIB_FRAGOKS);
 862         kfree_skb(skb);
 863         return err;
 864
 865 fail:
 866         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 867                       IPSTATS_MIB_FRAGFAILS);
 868         kfree_skb(skb);
 869         return err;
 870 }
 871
 872 static inline int ip6_rt_check(struct rt6key *rt_key,
 873                                struct in6_addr *fl_addr,
 874                                struct in6_addr *addr_cache)
 875 {
 876         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 877                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 878 }
 879
 880 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 881                                           struct dst_entry *dst,
 882                                           struct flowi *fl)
 883 {
 884         struct ipv6_pinfo *np = inet6_sk(sk);
 885         struct rt6_info *rt = (struct rt6_info *)dst;
 886
 887         if (!dst)
 888                 goto out;
 889
 890         /* Yes, checking route validity in not connected
 891          * case is not very simple. Take into account,
 892          * that we do not support routing by source, TOS,
 893          * and MSG_DONTROUTE            --ANK (980726)
 894          *
 895          * 1. ip6_rt_check(): If route was host route,
 896          *    check that cached destination is current.
 897          *    If it is network route, we still may
 898          *    check its validity using saved pointer
 899          *    to the last used address: daddr_cache.
 900          *    We do not want to save whole address now,
 901          *    (because main consumer of this service
 902          *    is tcp, which has not this problem),
 903          *    so that the last trick works only on connected
 904          *    sockets.
 905          * 2. oif also should be the same.
 906          */
 907         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 908 #ifdef CONFIG_IPV6_SUBTREES
 909             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 910 #endif
 911             (fl->oif && fl->oif != dst->dev->ifindex)) {
 912                 dst_release(dst);
 913                 dst = NULL;
 914         }
 915
 916 out:
 917         return dst;
 918 }
 919
 920 static int ip6_dst_lookup_tail(struct sock *sk,
 921                                struct dst_entry **dst, struct flowi *fl)
 922 {
 923         int err;
 924         struct net *net = sock_net(sk);
 925
 926         if (*dst == NULL)
 927                 *dst = ip6_route_output(net, sk, fl);
 928
 929         if ((err = (*dst)->error))
 930                 goto out_err_release;
 931
 932         if (ipv6_addr_any(&fl->fl6_src)) {
 933                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
 934                                          &fl->fl6_dst,
 935                                          sk ? inet6_sk(sk)->srcprefs : 0,
 936                                          &fl->fl6_src);
 937                 if (err)
 938                         goto out_err_release;
 939         }
 940
 941 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 942         /*
 943          * Here if the dst entry we've looked up
 944          * has a neighbour entry that is in the INCOMPLETE
 945          * state and the src address from the flow is
 946          * marked as OPTIMISTIC, we release the found
 947          * dst entry and replace it instead with the
 948          * dst entry of the nexthop router
 949          */
 950         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
 951                 struct inet6_ifaddr *ifp;
 952                 struct flowi fl_gw;
 953                 int redirect;
 954
 955                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
 956                                       (*dst)->dev, 1);
 957
 958                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 959                 if (ifp)
 960                         in6_ifa_put(ifp);
 961
 962                 if (redirect) {
 963                         /*
 964                          * We need to get the dst entry for the
 965                          * default router instead
 966                          */
 967                         dst_release(*dst);
 968                         memcpy(&fl_gw, fl, sizeof(struct flowi));
 969                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 970                         *dst = ip6_route_output(net, sk, &fl_gw);
 971                         if ((err = (*dst)->error))
 972                                 goto out_err_release;
 973                 }
 974         }
 975 #endif
 976
 977         return 0;
 978
 979 out_err_release:
 980         if (err == -ENETUNREACH)
 981                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 982         dst_release(*dst);
 983         *dst = NULL;
 984         return err;
 985 }
 986
 987 /**
 988  *      ip6_dst_lookup - perform route lookup on flow
 989  *      @sk: socket which provides route info
 990  *      @dst: pointer to dst_entry * for result
 991  *      @fl: flow to lookup
 992  *
 993  *      This function performs a route lookup on the given flow.
 994  *
 995  *      It returns zero on success, or a standard errno code on error.
 996  */
 997 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 998 {
 999         *dst = NULL;
1000         return ip6_dst_lookup_tail(sk, dst, fl);
1001 }
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003
1004 /**
1005  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1006  *      @sk: socket which provides the dst cache and route info
1007  *      @dst: pointer to dst_entry * for result
1008  *      @fl: flow to lookup
1009  *
1010  *      This function performs a route lookup on the given flow with the
1011  *      possibility of using the cached route in the socket if it is valid.
1012  *      It will take the socket dst lock when operating on the dst cache.
1013  *      As a result, this function can only be used in process context.
1014  *
1015  *      It returns zero on success, or a standard errno code on error.
1016  */
1017 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1018 {
1019         *dst = NULL;
1020         if (sk) {
1021                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1022                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1023         }
1024
1025         return ip6_dst_lookup_tail(sk, dst, fl);
1026 }
1027 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1028
1029 static inline int ip6_ufo_append_data(struct sock *sk,
1030                         int getfrag(void *from, char *to, int offset, int len,
1031                         int odd, struct sk_buff *skb),
1032                         void *from, int length, int hh_len, int fragheaderlen,
1033                         int transhdrlen, int mtu,unsigned int flags)
1034
1035 {
1036         struct sk_buff *skb;
1037         int err;
1038
1039         /* There is support for UDP large send offload by network
1040          * device, so create one single skb packet containing complete
1041          * udp datagram
1042          */
1043         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1044                 skb = sock_alloc_send_skb(sk,
1045                         hh_len + fragheaderlen + transhdrlen + 20,
1046                         (flags & MSG_DONTWAIT), &err);
1047                 if (skb == NULL)
1048                         return -ENOMEM;
1049
1050                 /* reserve space for Hardware header */
1051                 skb_reserve(skb, hh_len);
1052
1053                 /* create space for UDP/IP header */
1054                 skb_put(skb,fragheaderlen + transhdrlen);
1055
1056                 /* initialize network header pointer */
1057                 skb_reset_network_header(skb);
1058
1059                 /* initialize protocol header pointer */
1060                 skb->transport_header = skb->network_header + fragheaderlen;
1061
1062                 skb->ip_summed = CHECKSUM_PARTIAL;
1063                 skb->csum = 0;
1064                 sk->sk_sndmsg_off = 0;
1065         }
1066
1067         err = skb_append_datato_frags(sk,skb, getfrag, from,
1068                                       (length - transhdrlen));
1069         if (!err) {
1070                 struct frag_hdr fhdr;
1071
1072                 /* Specify the length of each IPv6 datagram fragment.
1073                  * It has to be a multiple of 8.
1074                  */
1075                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1076                                              sizeof(struct frag_hdr)) & ~7;
1077                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1078                 ipv6_select_ident(&fhdr);
1079                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1080                 __skb_queue_tail(&sk->sk_write_queue, skb);
1081
1082                 return 0;
1083         }
1084         /* There is not enough support do UPD LSO,
1085          * so follow normal path
1086          */
1087         kfree_skb(skb);
1088
1089         return err;
1090 }
1091
1092 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1093                                                gfp_t gfp)
1094 {
1095         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1096 }
1097
1098 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1099                                                 gfp_t gfp)
1100 {
1101         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1102 }
1103
1104 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1105         int offset, int len, int odd, struct sk_buff *skb),
1106         void *from, int length, int transhdrlen,
1107         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1108         struct rt6_info *rt, unsigned int flags)
1109 {
1110         struct inet_sock *inet = inet_sk(sk);
1111         struct ipv6_pinfo *np = inet6_sk(sk);
1112         struct sk_buff *skb;
1113         unsigned int maxfraglen, fragheaderlen;
1114         int exthdrlen;
1115         int hh_len;
1116         int mtu;
1117         int copy;
1118         int err;
1119         int offset = 0;
1120         int csummode = CHECKSUM_NONE;
1121
1122         if (flags&MSG_PROBE)
1123                 return 0;
1124         if (skb_queue_empty(&sk->sk_write_queue)) {
1125                 /*
1126                  * setup for corking
1127                  */
1128                 if (opt) {
1129                         if (WARN_ON(np->cork.opt))
1130                                 return -EINVAL;
1131
1132                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1133                         if (unlikely(np->cork.opt == NULL))
1134                                 return -ENOBUFS;
1135
1136                         np->cork.opt->tot_len = opt->tot_len;
1137                         np->cork.opt->opt_flen = opt->opt_flen;
1138                         np->cork.opt->opt_nflen = opt->opt_nflen;
1139
1140                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1141                                                             sk->sk_allocation);
1142                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1143                                 return -ENOBUFS;
1144
1145                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1146                                                             sk->sk_allocation);
1147                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1148                                 return -ENOBUFS;
1149
1150                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1151                                                            sk->sk_allocation);
1152                         if (opt->hopopt && !np->cork.opt->hopopt)
1153                                 return -ENOBUFS;
1154
1155                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1156                                                             sk->sk_allocation);
1157                         if (opt->srcrt && !np->cork.opt->srcrt)
1158                                 return -ENOBUFS;
1159
1160                         /* need source address above miyazawa*/
1161                 }
1162                 dst_hold(&rt->u.dst);
1163                 inet->cork.dst = &rt->u.dst;
1164                 inet->cork.fl = *fl;
1165                 np->cork.hop_limit = hlimit;
1166                 np->cork.tclass = tclass;
1167                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1168                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1169                 if (np->frag_size < mtu) {
1170                         if (np->frag_size)
1171                                 mtu = np->frag_size;
1172                 }
1173                 inet->cork.fragsize = mtu;
1174                 if (dst_allfrag(rt->u.dst.path))
1175                         inet->cork.flags |= IPCORK_ALLFRAG;
1176                 inet->cork.length = 0;
1177                 sk->sk_sndmsg_page = NULL;
1178                 sk->sk_sndmsg_off = 0;
1179                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1180                             rt->rt6i_nfheader_len;
1181                 length += exthdrlen;
1182                 transhdrlen += exthdrlen;
1183         } else {
1184                 rt = (struct rt6_info *)inet->cork.dst;
1185                 fl = &inet->cork.fl;
1186                 opt = np->cork.opt;
1187                 transhdrlen = 0;
1188                 exthdrlen = 0;
1189                 mtu = inet->cork.fragsize;
1190         }
1191
1192         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1193
1194         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1195                         (opt ? opt->opt_nflen : 0);
1196         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1197
1198         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1199                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1200                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1201                         return -EMSGSIZE;
1202                 }
1203         }
1204
1205         /*
1206          * Let's try using as much space as possible.
1207          * Use MTU if total length of the message fits into the MTU.
1208          * Otherwise, we need to reserve fragment header and
1209          * fragment alignment (= 8-15 octects, in total).
1210          *
1211          * Note that we may need to "move" the data from the tail of
1212          * of the buffer to the new fragment when we split
1213          * the message.
1214          *
1215          * FIXME: It may be fragmented into multiple chunks
1216          *        at once if non-fragmentable extension headers
1217          *        are too large.
1218          * --yoshfuji
1219          */
1220
1221         inet->cork.length += length;
1222         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1223             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1224
1225                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1226                                           fragheaderlen, transhdrlen, mtu,
1227                                           flags);
1228                 if (err)
1229                         goto error;
1230                 return 0;
1231         }
1232
1233         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1234                 goto alloc_new_skb;
1235
1236         while (length > 0) {
1237                 /* Check if the remaining data fits into current packet. */
1238                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1239                 if (copy < length)
1240                         copy = maxfraglen - skb->len;
1241
1242                 if (copy <= 0) {
1243                         char *data;
1244                         unsigned int datalen;
1245                         unsigned int fraglen;
1246                         unsigned int fraggap;
1247                         unsigned int alloclen;
1248                         struct sk_buff *skb_prev;
1249 alloc_new_skb:
1250                         skb_prev = skb;
1251
1252                         /* There's no room in the current skb */
1253                         if (skb_prev)
1254                                 fraggap = skb_prev->len - maxfraglen;
1255                         else
1256                                 fraggap = 0;
1257
1258                         /*
1259                          * If remaining data exceeds the mtu,
1260                          * we know we need more fragment(s).
1261                          */
1262                         datalen = length + fraggap;
1263                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1264                                 datalen = maxfraglen - fragheaderlen;
1265
1266                         fraglen = datalen + fragheaderlen;
1267                         if ((flags & MSG_MORE) &&
1268                             !(rt->u.dst.dev->features&NETIF_F_SG))
1269                                 alloclen = mtu;
1270                         else
1271                                 alloclen = datalen + fragheaderlen;
1272
1273                         /*
1274                          * The last fragment gets additional space at tail.
1275                          * Note: we overallocate on fragments with MSG_MODE
1276                          * because we have no idea if we're the last one.
1277                          */
1278                         if (datalen == length + fraggap)
1279                                 alloclen += rt->u.dst.trailer_len;
1280
1281                         /*
1282                          * We just reserve space for fragment header.
1283                          * Note: this may be overallocation if the message
1284                          * (without MSG_MORE) fits into the MTU.
1285                          */
1286                         alloclen += sizeof(struct frag_hdr);
1287
1288                         if (transhdrlen) {
1289                                 skb = sock_alloc_send_skb(sk,
1290                                                 alloclen + hh_len,
1291                                                 (flags & MSG_DONTWAIT), &err);
1292                         } else {
1293                                 skb = NULL;
1294                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1295                                     2 * sk->sk_sndbuf)
1296                                         skb = sock_wmalloc(sk,
1297                                                            alloclen + hh_len, 1,
1298                                                            sk->sk_allocation);
1299                                 if (unlikely(skb == NULL))
1300                                         err = -ENOBUFS;
1301                         }
1302                         if (skb == NULL)
1303                                 goto error;
1304                         /*
1305                          *      Fill in the control structures
1306                          */
1307                         skb->ip_summed = csummode;
1308                         skb->csum = 0;
1309                         /* reserve for fragmentation */
1310                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1311
1312                         /*
1313                          *      Find where to start putting bytes
1314                          */
1315                         data = skb_put(skb, fraglen);
1316                         skb_set_network_header(skb, exthdrlen);
1317                         data += fragheaderlen;
1318                         skb->transport_header = (skb->network_header +
1319                                                  fragheaderlen);
1320                         if (fraggap) {
1321                                 skb->csum = skb_copy_and_csum_bits(
1322                                         skb_prev, maxfraglen,
1323                                         data + transhdrlen, fraggap, 0);
1324                                 skb_prev->csum = csum_sub(skb_prev->csum,
1325                                                           skb->csum);
1326                                 data += fraggap;
1327                                 pskb_trim_unique(skb_prev, maxfraglen);
1328                         }
1329                         copy = datalen - transhdrlen - fraggap;
1330                         if (copy < 0) {
1331                                 err = -EINVAL;
1332                                 kfree_skb(skb);
1333                                 goto error;
1334                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1335                                 err = -EFAULT;
1336                                 kfree_skb(skb);
1337                                 goto error;
1338                         }
1339
1340                         offset += copy;
1341                         length -= datalen - fraggap;
1342                         transhdrlen = 0;
1343                         exthdrlen = 0;
1344                         csummode = CHECKSUM_NONE;
1345
1346                         /*
1347                          * Put the packet on the pending queue
1348                          */
1349                         __skb_queue_tail(&sk->sk_write_queue, skb);
1350                         continue;
1351                 }
1352
1353                 if (copy > length)
1354                         copy = length;
1355
1356                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1357                         unsigned int off;
1358
1359                         off = skb->len;
1360                         if (getfrag(from, skb_put(skb, copy),
1361                                                 offset, copy, off, skb) < 0) {
1362                                 __skb_trim(skb, off);
1363                                 err = -EFAULT;
1364                                 goto error;
1365                         }
1366                 } else {
1367                         int i = skb_shinfo(skb)->nr_frags;
1368                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1369                         struct page *page = sk->sk_sndmsg_page;
1370                         int off = sk->sk_sndmsg_off;
1371                         unsigned int left;
1372
1373                         if (page && (left = PAGE_SIZE - off) > 0) {
1374                                 if (copy >= left)
1375                                         copy = left;
1376                                 if (page != frag->page) {
1377                                         if (i == MAX_SKB_FRAGS) {
1378                                                 err = -EMSGSIZE;
1379                                                 goto error;
1380                                         }
1381                                         get_page(page);
1382                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1383                                         frag = &skb_shinfo(skb)->frags[i];
1384                                 }
1385                         } else if(i < MAX_SKB_FRAGS) {
1386                                 if (copy > PAGE_SIZE)
1387                                         copy = PAGE_SIZE;
1388                                 page = alloc_pages(sk->sk_allocation, 0);
1389                                 if (page == NULL) {
1390                                         err = -ENOMEM;
1391                                         goto error;
1392                                 }
1393                                 sk->sk_sndmsg_page = page;
1394                                 sk->sk_sndmsg_off = 0;
1395
1396                                 skb_fill_page_desc(skb, i, page, 0, 0);
1397                                 frag = &skb_shinfo(skb)->frags[i];
1398                         } else {
1399                                 err = -EMSGSIZE;
1400                                 goto error;
1401                         }
1402                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1403                                 err = -EFAULT;
1404                                 goto error;
1405                         }
1406                         sk->sk_sndmsg_off += copy;
1407                         frag->size += copy;
1408                         skb->len += copy;
1409                         skb->data_len += copy;
1410                         skb->truesize += copy;
1411                         atomic_add(copy, &sk->sk_wmem_alloc);
1412                 }
1413                 offset += copy;
1414                 length -= copy;
1415         }
1416         return 0;
1417 error:
1418         inet->cork.length -= length;
1419         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1420         return err;
1421 }
1422
1423 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1424 {
1425         if (np->cork.opt) {
1426                 kfree(np->cork.opt->dst0opt);
1427                 kfree(np->cork.opt->dst1opt);
1428                 kfree(np->cork.opt->hopopt);
1429                 kfree(np->cork.opt->srcrt);
1430                 kfree(np->cork.opt);
1431                 np->cork.opt = NULL;
1432         }
1433
1434         if (inet->cork.dst) {
1435                 dst_release(inet->cork.dst);
1436                 inet->cork.dst = NULL;
1437                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1438         }
1439         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1440 }
1441
1442 int ip6_push_pending_frames(struct sock *sk)
1443 {
1444         struct sk_buff *skb, *tmp_skb;
1445         struct sk_buff **tail_skb;
1446         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1447         struct inet_sock *inet = inet_sk(sk);
1448         struct ipv6_pinfo *np = inet6_sk(sk);
1449         struct net *net = sock_net(sk);
1450         struct ipv6hdr *hdr;
1451         struct ipv6_txoptions *opt = np->cork.opt;
1452         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1453         struct flowi *fl = &inet->cork.fl;
1454         unsigned char proto = fl->proto;
1455         int err = 0;
1456
1457         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1458                 goto out;
1459         tail_skb = &(skb_shinfo(skb)->frag_list);
1460
1461         /* move skb->data to ip header from ext header */
1462         if (skb->data < skb_network_header(skb))
1463                 __skb_pull(skb, skb_network_offset(skb));
1464         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1465                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1466                 *tail_skb = tmp_skb;
1467                 tail_skb = &(tmp_skb->next);
1468                 skb->len += tmp_skb->len;
1469                 skb->data_len += tmp_skb->len;
1470                 skb->truesize += tmp_skb->truesize;
1471                 tmp_skb->destructor = NULL;
1472                 tmp_skb->sk = NULL;
1473         }
1474
1475         /* Allow local fragmentation. */
1476         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1477                 skb->local_df = 1;
1478
1479         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1480         __skb_pull(skb, skb_network_header_len(skb));
1481         if (opt && opt->opt_flen)
1482                 ipv6_push_frag_opts(skb, opt, &proto);
1483         if (opt && opt->opt_nflen)
1484                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1485
1486         skb_push(skb, sizeof(struct ipv6hdr));
1487         skb_reset_network_header(skb);
1488         hdr = ipv6_hdr(skb);
1489
1490         *(__be32*)hdr = fl->fl6_flowlabel |
1491                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1492
1493         hdr->hop_limit = np->cork.hop_limit;
1494         hdr->nexthdr = proto;
1495         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1496         ipv6_addr_copy(&hdr->daddr, final_dst);
1497
1498         skb->priority = sk->sk_priority;
1499         skb->mark = sk->sk_mark;
1500
1501         skb_dst_set(skb, dst_clone(&rt->u.dst));
1502         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1503         if (proto == IPPROTO_ICMPV6) {
1504                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1505
1506                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1507                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1508         }
1509
1510         err = ip6_local_out(skb);
1511         if (err) {
1512                 if (err > 0)
1513                         err = net_xmit_errno(err);
1514                 if (err)
1515                         goto error;
1516         }
1517
1518 out:
1519         ip6_cork_release(inet, np);
1520         return err;
1521 error:
1522         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1523         goto out;
1524 }
1525
1526 void ip6_flush_pending_frames(struct sock *sk)
1527 {
1528         struct sk_buff *skb;
1529
1530         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1531                 if (skb_dst(skb))
1532                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1533                                       IPSTATS_MIB_OUTDISCARDS);
1534                 kfree_skb(skb);
1535         }
1536
1537         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1538 }