net/ipv6/ip6_output.c

/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

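/*
 *	__ip6_local_out() fills in the payload length (left at zero for
 *	jumbo payloads that exceed IPV6_MAXPLEN) and runs the
 *	NF_INET_LOCAL_OUT netfilter hook; ip6_local_out() then continues
 *	with dst_output() when the hook returns 1 (accept).
 */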
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

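/*
 *	Hand the packet to the neighbour layer: use the cached hardware
 *	header if one exists, otherwise let the neighbour entry resolve
 *	and emit it.  Without a neighbour the packet cannot be delivered.
 */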
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx(newskb);
	return 0;
}

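/*
 *	For multicast destinations, loop a copy of the packet back to
 *	local listeners (through NF_INET_POST_ROUTING and the loopback
 *	xmit above) before sending it on the wire, and discard packets
 *	whose hop limit is already zero.
 */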
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

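/*
 *	Main output entry point: discard if IPv6 is administratively
 *	disabled on the device, fragment when the packet exceeds the
 *	path MTU (and is not GSO) or the route demands fragmentation,
 *	otherwise transmit directly.
 */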
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

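/*
 *	Deliver a Router Alert packet to every raw socket registered for
 *	this alert value (and, if bound, bound to the receiving device).
 *	All but the last matching socket get a clone; returns 1 when the
 *	original skb was consumed by the last match, 0 otherwise.
 */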
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

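/*
 *	Decide what to do with a packet destined to a proxied address:
 *	returns 1 to hand it to local input (unicast neighbour discovery
 *	messages), 0 to forward it normally, and -1 when the destination
 *	was link-local and a link failure has already been signalled.
 */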
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

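/*
 *	Forwarding path: verify forwarding is enabled and the packet
 *	passes XFRM policy, deliver Router Alert packets to interested
 *	sockets, enforce the hop limit and path MTU (sending ICMPv6
 *	errors as needed), possibly emit a redirect, then decrement the
 *	hop limit and hand the packet to NF_INET_FORWARD.
 */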
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

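/*
 *	Propagate per-packet metadata (packet type, priority, dst,
 *	netfilter and security state) from the original skb to a freshly
 *	built fragment.
 */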
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

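/*
 *	Walk the extension header chain and return the offset at which a
 *	Fragment header must be inserted (after any Hop-by-Hop, Routing
 *	and relevant Destination Options headers); *nexthdr is left
 *	pointing at the "next header" byte that precedes the insertion
 *	point, ready to be patched to NEXTHDR_FRAGMENT.
 */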
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

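/*
 *	Fragment the packet.  The fast path reuses an existing frag list,
 *	prepending the unfragmentable part and a Fragment header to every
 *	list element; the slow path allocates a fresh skb per fragment
 *	and copies the payload into it.
 */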
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

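/*
 *	Returns nonzero when a cached route cannot be validated for this
 *	flow address: it is neither an exact host route for the address
 *	nor confirmed by the socket's cached last-used address.
 */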
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

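/*
 *	UDP fragmentation offload path of ip6_append_data(): build one
 *	large skb carrying the whole datagram in page fragments and let
 *	the device split it, recording the gso size and fragment id.
 */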
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

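/*
 *	Append data to the pending (corked) queue for this socket.  On
 *	the first call the cork state (options, route, MTU) is set up;
 *	data is then packed into MTU-sized skbs, spilling into page
 *	fragments when the device supports scatter/gather, until
 *	ip6_push_pending_frames() builds the IPv6 header and transmits
 *	the queue.
 */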
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

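/*
 *	Free the duplicated cork options and drop the cached route once
 *	the pending queue has been pushed or flushed.
 */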
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

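/*
 *	Collapse the queued skbs into one packet (tail skbs become the
 *	frag list of the first), prepend the extension headers and the
 *	IPv6 header, update the MIB counters and send the result via
 *	ip6_local_out().
 */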
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

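/*
 *	Discard everything still queued on the socket's write queue and
 *	release the cork state, counting each dropped packet as an
 *	output discard.
 */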
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}