net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
   /* Forward declaration: ip6_fragment() is defined further down in this
    * file but is already needed by ip6_finish_output() below. */
   59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 int __ip6_local_out(struct sk_buff *skb)
  62 {
  63         int len;
  64
  65         len = skb->len - sizeof(struct ipv6hdr);
  66         if (len > IPV6_MAXPLEN)
  67                 len = 0;
  68         ipv6_hdr(skb)->payload_len = htons(len);
  69
  70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  71                        skb_dst(skb)->dev, dst_output);
  72 }
  73
  74 int ip6_local_out(struct sk_buff *skb)
  75 {
  76         int err;
  77
  78         err = __ip6_local_out(skb);
  79         if (likely(err == 1))
  80                 err = dst_output(skb);
  81
  82         return err;
  83 }
  84 EXPORT_SYMBOL_GPL(ip6_local_out);
  85
  86 /* dev_loopback_xmit for use with netfilter. */
  87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  88 {
  89         skb_reset_mac_header(newskb);
  90         __skb_pull(newskb, skb_network_offset(newskb));
  91         newskb->pkt_type = PACKET_LOOPBACK;
  92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
  93         WARN_ON(!skb_dst(newskb));
  94
  95         netif_rx_ni(newskb);
  96         return 0;
  97 }
  98
   /*
    * Final output step: loop multicast packets back locally when required,
    * then hand the skb to the cached hardware header or the neighbour
    * output function of its dst entry.
    *
    * Returns the result of neigh_hh_output() / neighbour->output(), 0 when
    * a multicast packet with hop_limit 0 is discarded, or -EINVAL when the
    * dst has neither a cached header nor a neighbour (skb freed).
    */
   99 static int ip6_finish_output2(struct sk_buff *skb)
  100 {
  101         struct dst_entry *dst = skb_dst(skb);
  102         struct net_device *dev = dst->dev;
  103
  104         skb->protocol = htons(ETH_P_IPV6);
  105         skb->dev = dev;
  106
  107         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  108                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  109
              /* Loop a copy back when the device is not loopback, the
               * socket allows multicast loopback, and either a multicast
               * routing socket wants this (non-forwarded) packet or the
               * group is joined on the outgoing device. */
  110                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
  111                     ((mroute6_socket(dev_net(dev), skb) &&
  112                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  113                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  114                                          &ipv6_hdr(skb)->saddr))) {
  115                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  116
  117                         /* Do not check for IFF_ALLMULTI; multicast routing
  118                            is not supported in any case.
  119                          */
                      /* A failed clone is not an error: the loopback copy
                       * is simply skipped. */
  120                         if (newskb)
  121                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  122                                         newskb, NULL, newskb->dev,
  123                                         ip6_dev_loopback_xmit);
  124
                      /* With hop_limit 0 only the looped-back copy is
                       * delivered; the original is dropped here. */
  125                         if (ipv6_hdr(skb)->hop_limit == 0) {
  126                                 IP6_INC_STATS(dev_net(dev), idev,
  127                                               IPSTATS_MIB_OUTDISCARDS);
  128                                 kfree_skb(skb);
  129                                 return 0;
  130                         }
  131                 }
  132
  133                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
  134                                 skb->len);
  135         }
  136
              /* Prefer the cached hardware header, else the neighbour's
               * output method. */
  137         if (dst->hh)
  138                 return neigh_hh_output(dst->hh, skb);
  139         else if (dst->neighbour)
  140                 return dst->neighbour->output(skb);
  141
              /* Neither is available: count the failure and drop. */
  142         IP6_INC_STATS_BH(dev_net(dst->dev),
  143                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
  144         kfree_skb(skb);
  145         return -EINVAL;
  146 }
 147
 148 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 149 {
 150         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 151
 152         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 153                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 154 }
 155
 156 static int ip6_finish_output(struct sk_buff *skb)
 157 {
 158         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 159             dst_allfrag(skb_dst(skb)))
 160                 return ip6_fragment(skb, ip6_finish_output2);
 161         else
 162                 return ip6_finish_output2(skb);
 163 }
 164
 165 int ip6_output(struct sk_buff *skb)
 166 {
 167         struct net_device *dev = skb_dst(skb)->dev;
 168         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 169         if (unlikely(idev->cnf.disable_ipv6)) {
 170                 IP6_INC_STATS(dev_net(dev), idev,
 171                               IPSTATS_MIB_OUTDISCARDS);
 172                 kfree_skb(skb);
 173                 return 0;
 174         }
 175
 176         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 177                             ip6_finish_output,
 178                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 179 }
 180
  181 /*
  182  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
       *
       *      Prepends the extension headers described by @opt (if any) and
       *      an IPv6 header built from @fl and the socket settings, then
       *      passes the packet to the NF_INET_LOCAL_OUT hook with dst_output
       *      as continuation.
       *
       *      Returns the NF_HOOK() result on the send path, -ENOBUFS when the
       *      headroom could not be enlarged (skb freed), or -EMSGSIZE when
       *      the packet is larger than the dst MTU and is neither local_df
       *      nor GSO (ICMPV6_PKT_TOOBIG is issued and the skb freed).
  183  */
  184
  185 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
  186              struct ipv6_txoptions *opt)
  187 {
  188         struct net *net = sock_net(sk);
  189         struct ipv6_pinfo *np = inet6_sk(sk);
  190         struct in6_addr *first_hop = &fl->fl6_dst;
  191         struct dst_entry *dst = skb_dst(skb);
  192         struct ipv6hdr *hdr;
  193         u8  proto = fl->proto;
  194         int seg_len = skb->len;
  195         int hlimit = -1;
  196         int tclass = 0;
  197         u32 mtu;
  198
  199         if (opt) {
  200                 unsigned int head_room;
  201
  202                 /* First: exthdrs may take lots of space (~8K for now)
  203                    MAX_HEADER is not enough.
  204                  */
  205                 head_room = opt->opt_nflen + opt->opt_flen;
  206                 seg_len += head_room;
  207                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
  208
  209                 if (skb_headroom(skb) < head_room) {
  210                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
  211                         if (skb2 == NULL) {
  212                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
  213                                               IPSTATS_MIB_OUTDISCARDS);
  214                                 kfree_skb(skb);
  215                                 return -ENOBUFS;
  216                         }
                      /* Continue with the enlarged copy: release the
                       * original and charge the new skb to the socket. */
  217                         kfree_skb(skb);
  218                         skb = skb2;
  219                         skb_set_owner_w(skb, sk);
  220                 }
              /* Headers are pushed innermost first; both helpers update
               * proto, and the non-fragmentable one may also change
               * first_hop. */
  221                 if (opt->opt_flen)
  222                         ipv6_push_frag_opts(skb, opt, &proto);
  223                 if (opt->opt_nflen)
  224                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
  225         }
  226
  227         skb_push(skb, sizeof(struct ipv6hdr));
  228         skb_reset_network_header(skb);
  229         hdr = ipv6_hdr(skb);
  230
  231         /*
  232          *      Fill in the IPv6 header
  233          */
  234         if (np) {
  235                 tclass = np->tclass;
  236                 hlimit = np->hop_limit;
  237         }
              /* A negative hop limit (no socket info, or a negative
               * np->hop_limit) selects the hop limit of the route. */
  238         if (hlimit < 0)
  239                 hlimit = ip6_dst_hoplimit(dst);
  240
              /* First 32-bit word: 0x6 in the top nibble, tclass shifted to
               * bit 20, OR-ed with the flow label from the flow. */
  241         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
  242
  243         hdr->payload_len = htons(seg_len);
  244         hdr->nexthdr = proto;
  245         hdr->hop_limit = hlimit;
  246
  247         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
  248         ipv6_addr_copy(&hdr->daddr, first_hop);
  249
  250         skb->priority = sk->sk_priority;
  251         skb->mark = sk->sk_mark;
  252
  253         mtu = dst_mtu(dst);
  254         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
  255                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
  256                               IPSTATS_MIB_OUT, skb->len);
  257                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  258                                dst->dev, dst_output);
  259         }
  260
              /* Too large to send: report the MTU via ICMPv6, count the
               * failure and drop the packet. */
  261         if (net_ratelimit())
  262                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
  263         skb->dev = dst->dev;
  264         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
  265         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
  266         kfree_skb(skb);
  267         return -EMSGSIZE;
  268 }
  269
  270 EXPORT_SYMBOL(ip6_xmit);
 271
 272 /*
 273  *      To avoid extra problems ND packets are send through this
 274  *      routine. It's code duplication but I really want to avoid
 275  *      extra checks since ipv6_build_header is used by TCP (which
 276  *      is for us performance critical)
 277  */
 278
 279 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 280                const struct in6_addr *saddr, const struct in6_addr *daddr,
 281                int proto, int len)
 282 {
 283         struct ipv6_pinfo *np = inet6_sk(sk);
 284         struct ipv6hdr *hdr;
 285         int totlen;
 286
 287         skb->protocol = htons(ETH_P_IPV6);
 288         skb->dev = dev;
 289
 290         totlen = len + sizeof(struct ipv6hdr);
 291
 292         skb_reset_network_header(skb);
 293         skb_put(skb, sizeof(struct ipv6hdr));
 294         hdr = ipv6_hdr(skb);
 295
 296         *(__be32*)hdr = htonl(0x60000000);
 297
 298         hdr->payload_len = htons(len);
 299         hdr->nexthdr = proto;
 300         hdr->hop_limit = np->hop_limit;
 301
 302         ipv6_addr_copy(&hdr->saddr, saddr);
 303         ipv6_addr_copy(&hdr->daddr, daddr);
 304
 305         return 0;
 306 }
 307
 308 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 309 {
 310         struct ip6_ra_chain *ra;
 311         struct sock *last = NULL;
 312
 313         read_lock(&ip6_ra_lock);
 314         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 315                 struct sock *sk = ra->sk;
 316                 if (sk && ra->sel == sel &&
 317                     (!sk->sk_bound_dev_if ||
 318                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 319                         if (last) {
 320                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 321                                 if (skb2)
 322                                         rawv6_rcv(last, skb2);
 323                         }
 324                         last = sk;
 325                 }
 326         }
 327
 328         if (last) {
 329                 rawv6_rcv(last, skb);
 330                 read_unlock(&ip6_ra_lock);
 331                 return 1;
 332         }
 333         read_unlock(&ip6_ra_lock);
 334         return 0;
 335 }
 336
 337 static int ip6_forward_proxy_check(struct sk_buff *skb)
 338 {
 339         struct ipv6hdr *hdr = ipv6_hdr(skb);
 340         u8 nexthdr = hdr->nexthdr;
 341         int offset;
 342
 343         if (ipv6_ext_hdr(nexthdr)) {
 344                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 345                 if (offset < 0)
 346                         return 0;
 347         } else
 348                 offset = sizeof(struct ipv6hdr);
 349
 350         if (nexthdr == IPPROTO_ICMPV6) {
 351                 struct icmp6hdr *icmp6;
 352
 353                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 354                                          offset + 1 - skb->data)))
 355                         return 0;
 356
 357                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 358
 359                 switch (icmp6->icmp6_type) {
 360                 case NDISC_ROUTER_SOLICITATION:
 361                 case NDISC_ROUTER_ADVERTISEMENT:
 362                 case NDISC_NEIGHBOUR_SOLICITATION:
 363                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 364                 case NDISC_REDIRECT:
 365                         /* For reaction involving unicast neighbor discovery
 366                          * message destined to the proxied address, pass it to
 367                          * input function.
 368                          */
 369                         return 1;
 370                 default:
 371                         break;
 372                 }
 373         }
 374
 375         /*
 376          * The proxying router can't forward traffic sent to a link-local
 377          * address, so signal the sender and discard the packet. This
 378          * behavior is clarified by the MIPv6 specification.
 379          */
 380         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 381                 dst_link_failure(skb);
 382                 return -1;
 383         }
 384
 385         return 0;
 386 }
 387
 388 static inline int ip6_forward_finish(struct sk_buff *skb)
 389 {
 390         return dst_output(skb);
 391 }
 392
  /*
   *      Forward a received packet towards skb_dst(skb).
   *
   *      Returns 0 when the packet was consumed by a Router Alert socket,
   *      the ip6_input() result for proxied neighbour-discovery messages,
   *      the NF_HOOK() result on the forward path, or a negative errno
   *      (-ETIMEDOUT, -EMSGSIZE, -EINVAL) after the skb has been freed.
   */
  393 int ip6_forward(struct sk_buff *skb)
  394 {
  395         struct dst_entry *dst = skb_dst(skb);
  396         struct ipv6hdr *hdr = ipv6_hdr(skb);
  397         struct inet6_skb_parm *opt = IP6CB(skb);
  398         struct net *net = dev_net(dst->dev);
  399         u32 mtu;
  400
              /* Forwarding disabled: counted as an address error. */
  401         if (net->ipv6.devconf_all->forwarding == 0)
  402                 goto error;
  403
  404         if (skb_warn_if_lro(skb))
  405                 goto drop;
  406
  407         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
  408                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
  409                 goto drop;
  410         }
  411
  412         skb_forward_csum(skb);
  413
  414         /*
  415          *      We DO NOT make any processing on
  416          *      RA packets, pushing them to user level AS IS
  417          *      without any WARRANTY that application will be able
  418          *      to interpret them. The reason is that we
  419          *      cannot make anything clever here.
  420          *
  421          *      We are not end-node, so that if packet contains
  422          *      AH/ESP, we cannot make anything.
  423          *      Defragmentation also would be mistake, RA packets
  424          *      cannot be fragmented, because there is no warranty
  425          *      that different fragments will go along one path. --ANK
  426          */
  427         if (opt->ra) {
                      /* opt->ra is the offset of the Router Alert option;
                       * bytes 2..3 of it form the 16-bit selector. */
  428                 u8 *ptr = skb_network_header(skb) + opt->ra;
  429                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
  430                         return 0;
  431         }
  432
  433         /*
  434          *      check and decrement ttl
  435          */
  436         if (hdr->hop_limit <= 1) {
  437                 /* Force OUTPUT device used as source address */
  438                 skb->dev = dst->dev;
  439                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
  440                 IP6_INC_STATS_BH(net,
  441                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
  442
  443                 kfree_skb(skb);
  444                 return -ETIMEDOUT;
  445         }
  446
  447         /* XXX: idev->cnf.proxy_ndp? */
  448         if (net->ipv6.devconf_all->proxy_ndp &&
  449             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
  450                 int proxied = ip6_forward_proxy_check(skb);
  451                 if (proxied > 0)
  452                         return ip6_input(skb);
  453                 else if (proxied < 0) {
  454                         IP6_INC_STATS(net, ip6_dst_idev(dst),
  455                                       IPSTATS_MIB_INDISCARDS);
  456                         goto drop;
  457                 }
                      /* NOTE(review): ip6_forward_proxy_check() may call
                       * pskb_may_pull(); hdr is not re-read afterwards but
                       * is dereferenced below (hdr->daddr, hdr->saddr) --
                       * confirm the header cannot be reallocated here. */
  458         }
  459
  460         if (!xfrm6_route_forward(skb)) {
  461                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
  462                 goto drop;
  463         }
              /* Re-read the dst after xfrm6_route_forward(). */
  464         dst = skb_dst(skb);
  465
  466         /* IPv6 specs say nothing about it, but it is clear that we cannot
  467            send redirects to source routed frames.
  468            We don't send redirects to frames decapsulated from IPsec.
  469          */
  470         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
  471             !skb_sec_path(skb)) {
  472                 struct in6_addr *target = NULL;
  473                 struct rt6_info *rt;
  474                 struct neighbour *n = dst->neighbour;
  475
  476                 /*
  477                  *      incoming and outgoing devices are the same
  478                  *      send a redirect.
  479                  */
  480
                      /* Redirect target: the neighbour's key for gateway
                       * routes, otherwise the packet's destination. */
  481                 rt = (struct rt6_info *) dst;
  482                 if ((rt->rt6i_flags & RTF_GATEWAY))
  483                         target = (struct in6_addr*)&n->primary_key;
  484                 else
  485                         target = &hdr->daddr;
  486
  487                 /* Limit redirects both by destination (here)
  488                    and by source (inside ndisc_send_redirect)
  489                  */
  490                 if (xrlim_allow(dst, 1*HZ))
  491                         ndisc_send_redirect(skb, n, target);
  492         } else {
  493                 int addrtype = ipv6_addr_type(&hdr->saddr);
  494
  495                 /* This check is security critical. */
  496                 if (addrtype == IPV6_ADDR_ANY ||
  497                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
  498                         goto error;
  499                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
  500                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
  501                                     ICMPV6_NOT_NEIGHBOUR, 0);
  502                         goto error;
  503                 }
  504         }
  505
              /* Never use an MTU below IPV6_MIN_MTU for the size check. */
  506         mtu = dst_mtu(dst);
  507         if (mtu < IPV6_MIN_MTU)
  508                 mtu = IPV6_MIN_MTU;
  509
  510         if (skb->len > mtu && !skb_is_gso(skb)) {
  511                 /* Again, force OUTPUT device used as source address */
  512                 skb->dev = dst->dev;
  513                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
  514                 IP6_INC_STATS_BH(net,
  515                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
  516                 IP6_INC_STATS_BH(net,
  517                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
  518                 kfree_skb(skb);
  519                 return -EMSGSIZE;
  520         }
  521
  522         if (skb_cow(skb, dst->dev->hard_header_len)) {
  523                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
  524                 goto drop;
  525         }
  526
              /* Re-read the header pointer after skb_cow(). */
  527         hdr = ipv6_hdr(skb);
  528
  529         /* Mangling hops number delayed to point after skb COW */
  530
  531         hdr->hop_limit--;
  532
  533         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
  534         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
  535                        ip6_forward_finish);
  536
      /* error: additionally counts an input address error, then falls
       * through to drop, which frees the skb and returns -EINVAL. */
  537 error:
  538         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
  539 drop:
  540         kfree_skb(skb);
  541         return -EINVAL;
  542 }
 543
 544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 545 {
 546         to->pkt_type = from->pkt_type;
 547         to->priority = from->priority;
 548         to->protocol = from->protocol;
 549         skb_dst_drop(to);
 550         skb_dst_set(to, dst_clone(skb_dst(from)));
 551         to->dev = from->dev;
 552         to->mark = from->mark;
 553
 554 #ifdef CONFIG_NET_SCHED
 555         to->tc_index = from->tc_index;
 556 #endif
 557         nf_copy(to, from);
 558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 560         to->nf_trace = from->nf_trace;
 561 #endif
 562         skb_copy_secmark(to, from);
 563 }
 564
 565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 566 {
 567         u16 offset = sizeof(struct ipv6hdr);
 568         struct ipv6_opt_hdr *exthdr =
 569                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 570         unsigned int packet_len = skb->tail - skb->network_header;
 571         int found_rhdr = 0;
 572         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 573
 574         while (offset + 1 <= packet_len) {
 575
 576                 switch (**nexthdr) {
 577
 578                 case NEXTHDR_HOP:
 579                         break;
 580                 case NEXTHDR_ROUTING:
 581                         found_rhdr = 1;
 582                         break;
 583                 case NEXTHDR_DEST:
 584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 585                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 586                                 break;
 587 #endif
 588                         if (found_rhdr)
 589                                 return offset;
 590                         break;
 591                 default :
 592                         return offset;
 593                 }
 594
 595                 offset += ipv6_optlen(exthdr);
 596                 *nexthdr = &exthdr->nexthdr;
 597                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 598                                                  offset);
 599         }
 600
 601         return offset;
 602 }
 603
 604 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 605 {
 606         struct sk_buff *frag;
 607         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 608         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 609         struct ipv6hdr *tmp_hdr;
 610         struct frag_hdr *fh;
 611         unsigned int mtu, hlen, left, len;
 612         __be32 frag_id = 0;
 613         int ptr, offset = 0, err=0;
 614         u8 *prevhdr, nexthdr = 0;
 615         struct net *net = dev_net(skb_dst(skb)->dev);
 616
 617         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 618         nexthdr = *prevhdr;
 619
 620         mtu = ip6_skb_dst_mtu(skb);
 621
 622         /* We must not fragment if the socket is set to force MTU discovery
 623          * or if the skb it not generated by a local socket.
 624          */
 625         if (!skb->local_df && skb->len > mtu) {
 626                 skb->dev = skb_dst(skb)->dev;
 627                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 628                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 629                               IPSTATS_MIB_FRAGFAILS);
 630                 kfree_skb(skb);
 631                 return -EMSGSIZE;
 632         }
 633
 634         if (np && np->frag_size < mtu) {
 635                 if (np->frag_size)
 636                         mtu = np->frag_size;
 637         }
 638         mtu -= hlen + sizeof(struct frag_hdr);
 639
 640         if (skb_has_frags(skb)) {
 641                 int first_len = skb_pagelen(skb);
 642                 int truesizes = 0;
 643
 644                 if (first_len - hlen > mtu ||
 645                     ((first_len - hlen) & 7) ||
 646                     skb_cloned(skb))
 647                         goto slow_path;
 648
 649                 skb_walk_frags(skb, frag) {
 650                         /* Correct geometry. */
 651                         if (frag->len > mtu ||
 652                             ((frag->len & 7) && frag->next) ||
 653                             skb_headroom(frag) < hlen)
 654                             goto slow_path;
 655
 656                         /* Partially cloned skb? */
 657                         if (skb_shared(frag))
 658                                 goto slow_path;
 659
 660                         BUG_ON(frag->sk);
 661                         if (skb->sk) {
 662                                 frag->sk = skb->sk;
 663                                 frag->destructor = sock_wfree;
 664                                 truesizes += frag->truesize;
 665                         }
 666                 }
 667
 668                 err = 0;
 669                 offset = 0;
 670                 frag = skb_shinfo(skb)->frag_list;
 671                 skb_frag_list_init(skb);
 672                 /* BUILD HEADER */
 673
 674                 *prevhdr = NEXTHDR_FRAGMENT;
 675                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 676                 if (!tmp_hdr) {
 677                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 678                                       IPSTATS_MIB_FRAGFAILS);
 679                         return -ENOMEM;
 680                 }
 681
 682                 __skb_pull(skb, hlen);
 683                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 684                 __skb_push(skb, hlen);
 685                 skb_reset_network_header(skb);
 686                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 687
 688                 ipv6_select_ident(fh);
 689                 fh->nexthdr = nexthdr;
 690                 fh->reserved = 0;
 691                 fh->frag_off = htons(IP6_MF);
 692                 frag_id = fh->identification;
 693
 694                 first_len = skb_pagelen(skb);
 695                 skb->data_len = first_len - skb_headlen(skb);
 696                 skb->truesize -= truesizes;
 697                 skb->len = first_len;
 698                 ipv6_hdr(skb)->payload_len = htons(first_len -
 699                                                    sizeof(struct ipv6hdr));
 700
 701                 dst_hold(&rt->dst);
 702
 703                 for (;;) {
 704                         /* Prepare header of the next frame,
 705                          * before previous one went down. */
 706                         if (frag) {
 707                                 frag->ip_summed = CHECKSUM_NONE;
 708                                 skb_reset_transport_header(frag);
 709                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 710                                 __skb_push(frag, hlen);
 711                                 skb_reset_network_header(frag);
 712                                 memcpy(skb_network_header(frag), tmp_hdr,
 713                                        hlen);
 714                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 715                                 fh->nexthdr = nexthdr;
 716                                 fh->reserved = 0;
 717                                 fh->frag_off = htons(offset);
 718                                 if (frag->next != NULL)
 719                                         fh->frag_off |= htons(IP6_MF);
 720                                 fh->identification = frag_id;
 721                                 ipv6_hdr(frag)->payload_len =
 722                                                 htons(frag->len -
 723                                                       sizeof(struct ipv6hdr));
 724                                 ip6_copy_metadata(frag, skb);
 725                         }
 726
 727                         err = output(skb);
 728                         if(!err)
 729                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 730                                               IPSTATS_MIB_FRAGCREATES);
 731
 732                         if (err || !frag)
 733                                 break;
 734
 735                         skb = frag;
 736                         frag = skb->next;
 737                         skb->next = NULL;
 738                 }
 739
 740                 kfree(tmp_hdr);
 741
 742                 if (err == 0) {
 743                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 744                                       IPSTATS_MIB_FRAGOKS);
 745                         dst_release(&rt->dst);
 746                         return 0;
 747                 }
 748
 749                 while (frag) {
 750                         skb = frag->next;
 751                         kfree_skb(frag);
 752                         frag = skb;
 753                 }
 754
 755                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 756                               IPSTATS_MIB_FRAGFAILS);
 757                 dst_release(&rt->dst);
 758                 return err;
 759         }
 760
 761 slow_path:
 762         left = skb->len - hlen;         /* Space per frame */
 763         ptr = hlen;                     /* Where to start from */
 764
 765         /*
 766          *      Fragment the datagram.
 767          */
 768
 769         *prevhdr = NEXTHDR_FRAGMENT;
 770
 771         /*
 772          *      Keep copying data until we run out.
 773          */
 774         while(left > 0) {
 775                 len = left;
 776                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 777                 if (len > mtu)
 778                         len = mtu;
 779                 /* IF: we are not sending upto and including the packet end
 780                    then align the next start on an eight byte boundary */
 781                 if (len < left) {
 782                         len &= ~7;
 783                 }
 784                 /*
 785                  *      Allocate buffer.
 786                  */
 787
 788                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
 789                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 790                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 791                                       IPSTATS_MIB_FRAGFAILS);
 792                         err = -ENOMEM;
 793                         goto fail;
 794                 }
 795
 796                 /*
 797                  *      Set up data on packet
 798                  */
 799
 800                 ip6_copy_metadata(frag, skb);
 801                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
 802                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 803                 skb_reset_network_header(frag);
 804                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 805                 frag->transport_header = (frag->network_header + hlen +
 806                                           sizeof(struct frag_hdr));
 807
 808                 /*
 809                  *      Charge the memory for the fragment to any owner
 810                  *      it might possess
 811                  */
 812                 if (skb->sk)
 813                         skb_set_owner_w(frag, skb->sk);
 814
 815                 /*
 816                  *      Copy the packet header into the new buffer.
 817                  */
 818                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 819
 820                 /*
 821                  *      Build fragment header.
 822                  */
 823                 fh->nexthdr = nexthdr;
 824                 fh->reserved = 0;
 825                 if (!frag_id) {
 826                         ipv6_select_ident(fh);
 827                         frag_id = fh->identification;
 828                 } else
 829                         fh->identification = frag_id;
 830
 831                 /*
 832                  *      Copy a block of the IP datagram.
 833                  */
 834                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 835                         BUG();
 836                 left -= len;
 837
 838                 fh->frag_off = htons(offset);
 839                 if (left > 0)
 840                         fh->frag_off |= htons(IP6_MF);
 841                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 842                                                     sizeof(struct ipv6hdr));
 843
 844                 ptr += len;
 845                 offset += len;
 846
 847                 /*
 848                  *      Put this fragment into the sending queue.
 849                  */
 850                 err = output(frag);
 851                 if (err)
 852                         goto fail;
 853
 854                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 855                               IPSTATS_MIB_FRAGCREATES);
 856         }
 857         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 858                       IPSTATS_MIB_FRAGOKS);
 859         kfree_skb(skb);
 860         return err;
 861
 862 fail:
 863         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 864                       IPSTATS_MIB_FRAGFAILS);
 865         kfree_skb(skb);
 866         return err;
 867 }
 868
 869 static inline int ip6_rt_check(struct rt6key *rt_key,
 870                                struct in6_addr *fl_addr,
 871                                struct in6_addr *addr_cache)
 872 {
 873         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 874                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 875 }
 876
 877 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 878                                           struct dst_entry *dst,
 879                                           struct flowi *fl)
 880 {
 881         struct ipv6_pinfo *np = inet6_sk(sk);
 882         struct rt6_info *rt = (struct rt6_info *)dst;
 883
 884         if (!dst)
 885                 goto out;
 886
 887         /* Yes, checking route validity in not connected
 888          * case is not very simple. Take into account,
 889          * that we do not support routing by source, TOS,
 890          * and MSG_DONTROUTE            --ANK (980726)
 891          *
 892          * 1. ip6_rt_check(): If route was host route,
 893          *    check that cached destination is current.
 894          *    If it is network route, we still may
 895          *    check its validity using saved pointer
 896          *    to the last used address: daddr_cache.
 897          *    We do not want to save whole address now,
 898          *    (because main consumer of this service
 899          *    is tcp, which has not this problem),
 900          *    so that the last trick works only on connected
 901          *    sockets.
 902          * 2. oif also should be the same.
 903          */
 904         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 905 #ifdef CONFIG_IPV6_SUBTREES
 906             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 907 #endif
 908             (fl->oif && fl->oif != dst->dev->ifindex)) {
 909                 dst_release(dst);
 910                 dst = NULL;
 911         }
 912
 913 out:
 914         return dst;
 915 }
 916
 917 static int ip6_dst_lookup_tail(struct sock *sk,
 918                                struct dst_entry **dst, struct flowi *fl)
 919 {
 920         int err;
 921         struct net *net = sock_net(sk);
 922
 923         if (*dst == NULL)
 924                 *dst = ip6_route_output(net, sk, fl);
 925
 926         if ((err = (*dst)->error))
 927                 goto out_err_release;
 928
 929         if (ipv6_addr_any(&fl->fl6_src)) {
 930                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
 931                                          &fl->fl6_dst,
 932                                          sk ? inet6_sk(sk)->srcprefs : 0,
 933                                          &fl->fl6_src);
 934                 if (err)
 935                         goto out_err_release;
 936         }
 937
 938 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 939         /*
 940          * Here if the dst entry we've looked up
 941          * has a neighbour entry that is in the INCOMPLETE
 942          * state and the src address from the flow is
 943          * marked as OPTIMISTIC, we release the found
 944          * dst entry and replace it instead with the
 945          * dst entry of the nexthop router
 946          */
 947         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
 948                 struct inet6_ifaddr *ifp;
 949                 struct flowi fl_gw;
 950                 int redirect;
 951
 952                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
 953                                       (*dst)->dev, 1);
 954
 955                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 956                 if (ifp)
 957                         in6_ifa_put(ifp);
 958
 959                 if (redirect) {
 960                         /*
 961                          * We need to get the dst entry for the
 962                          * default router instead
 963                          */
 964                         dst_release(*dst);
 965                         memcpy(&fl_gw, fl, sizeof(struct flowi));
 966                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 967                         *dst = ip6_route_output(net, sk, &fl_gw);
 968                         if ((err = (*dst)->error))
 969                                 goto out_err_release;
 970                 }
 971         }
 972 #endif
 973
 974         return 0;
 975
 976 out_err_release:
 977         if (err == -ENETUNREACH)
 978                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 979         dst_release(*dst);
 980         *dst = NULL;
 981         return err;
 982 }
 983
 984 /**
 985  *      ip6_dst_lookup - perform route lookup on flow
 986  *      @sk: socket which provides route info
 987  *      @dst: pointer to dst_entry * for result
 988  *      @fl: flow to lookup
 989  *
 990  *      This function performs a route lookup on the given flow.
 991  *
 992  *      It returns zero on success, or a standard errno code on error.
 993  */
 994 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 995 {
 996         *dst = NULL;
 997         return ip6_dst_lookup_tail(sk, dst, fl);
 998 }
 999 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1000
1001 /**
1002  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1003  *      @sk: socket which provides the dst cache and route info
1004  *      @dst: pointer to dst_entry * for result
1005  *      @fl: flow to lookup
1006  *
1007  *      This function performs a route lookup on the given flow with the
1008  *      possibility of using the cached route in the socket if it is valid.
1009  *      It will take the socket dst lock when operating on the dst cache.
1010  *      As a result, this function can only be used in process context.
1011  *
1012  *      It returns zero on success, or a standard errno code on error.
1013  */
1014 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1015 {
1016         *dst = NULL;
1017         if (sk) {
1018                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1019                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1020         }
1021
1022         return ip6_dst_lookup_tail(sk, dst, fl);
1023 }
1024 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1025
1026 static inline int ip6_ufo_append_data(struct sock *sk,
1027                         int getfrag(void *from, char *to, int offset, int len,
1028                         int odd, struct sk_buff *skb),
1029                         void *from, int length, int hh_len, int fragheaderlen,
1030                         int transhdrlen, int mtu,unsigned int flags)
1031
1032 {
1033         struct sk_buff *skb;
1034         int err;
1035
1036         /* There is support for UDP large send offload by network
1037          * device, so create one single skb packet containing complete
1038          * udp datagram
1039          */
1040         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1041                 skb = sock_alloc_send_skb(sk,
1042                         hh_len + fragheaderlen + transhdrlen + 20,
1043                         (flags & MSG_DONTWAIT), &err);
1044                 if (skb == NULL)
1045                         return -ENOMEM;
1046
1047                 /* reserve space for Hardware header */
1048                 skb_reserve(skb, hh_len);
1049
1050                 /* create space for UDP/IP header */
1051                 skb_put(skb,fragheaderlen + transhdrlen);
1052
1053                 /* initialize network header pointer */
1054                 skb_reset_network_header(skb);
1055
1056                 /* initialize protocol header pointer */
1057                 skb->transport_header = skb->network_header + fragheaderlen;
1058
1059                 skb->ip_summed = CHECKSUM_PARTIAL;
1060                 skb->csum = 0;
1061                 sk->sk_sndmsg_off = 0;
1062         }
1063
1064         err = skb_append_datato_frags(sk,skb, getfrag, from,
1065                                       (length - transhdrlen));
1066         if (!err) {
1067                 struct frag_hdr fhdr;
1068
1069                 /* Specify the length of each IPv6 datagram fragment.
1070                  * It has to be a multiple of 8.
1071                  */
1072                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1073                                              sizeof(struct frag_hdr)) & ~7;
1074                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1075                 ipv6_select_ident(&fhdr);
1076                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1077                 __skb_queue_tail(&sk->sk_write_queue, skb);
1078
1079                 return 0;
1080         }
1081         /* There is not enough support do UPD LSO,
1082          * so follow normal path
1083          */
1084         kfree_skb(skb);
1085
1086         return err;
1087 }
1088
1089 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1090                                                gfp_t gfp)
1091 {
1092         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1093 }
1094
1095 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1096                                                 gfp_t gfp)
1097 {
1098         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1099 }
1100
1101 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1102         int offset, int len, int odd, struct sk_buff *skb),
1103         void *from, int length, int transhdrlen,
1104         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1105         struct rt6_info *rt, unsigned int flags, int dontfrag)
1106 {
1107         struct inet_sock *inet = inet_sk(sk);
1108         struct ipv6_pinfo *np = inet6_sk(sk);
1109         struct sk_buff *skb;
1110         unsigned int maxfraglen, fragheaderlen;
1111         int exthdrlen;
1112         int hh_len;
1113         int mtu;
1114         int copy;
1115         int err;
1116         int offset = 0;
1117         int csummode = CHECKSUM_NONE;
1118
1119         if (flags&MSG_PROBE)
1120                 return 0;
1121         if (skb_queue_empty(&sk->sk_write_queue)) {
1122                 /*
1123                  * setup for corking
1124                  */
1125                 if (opt) {
1126                         if (WARN_ON(np->cork.opt))
1127                                 return -EINVAL;
1128
1129                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1130                         if (unlikely(np->cork.opt == NULL))
1131                                 return -ENOBUFS;
1132
1133                         np->cork.opt->tot_len = opt->tot_len;
1134                         np->cork.opt->opt_flen = opt->opt_flen;
1135                         np->cork.opt->opt_nflen = opt->opt_nflen;
1136
1137                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1138                                                             sk->sk_allocation);
1139                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1140                                 return -ENOBUFS;
1141
1142                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1143                                                             sk->sk_allocation);
1144                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1145                                 return -ENOBUFS;
1146
1147                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1148                                                            sk->sk_allocation);
1149                         if (opt->hopopt && !np->cork.opt->hopopt)
1150                                 return -ENOBUFS;
1151
1152                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1153                                                             sk->sk_allocation);
1154                         if (opt->srcrt && !np->cork.opt->srcrt)
1155                                 return -ENOBUFS;
1156
1157                         /* need source address above miyazawa*/
1158                 }
1159                 dst_hold(&rt->dst);
1160                 inet->cork.dst = &rt->dst;
1161                 inet->cork.fl = *fl;
1162                 np->cork.hop_limit = hlimit;
1163                 np->cork.tclass = tclass;
1164                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1165                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1166                 if (np->frag_size < mtu) {
1167                         if (np->frag_size)
1168                                 mtu = np->frag_size;
1169                 }
1170                 inet->cork.fragsize = mtu;
1171                 if (dst_allfrag(rt->dst.path))
1172                         inet->cork.flags |= IPCORK_ALLFRAG;
1173                 inet->cork.length = 0;
1174                 sk->sk_sndmsg_page = NULL;
1175                 sk->sk_sndmsg_off = 0;
1176                 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1177                             rt->rt6i_nfheader_len;
1178                 length += exthdrlen;
1179                 transhdrlen += exthdrlen;
1180         } else {
1181                 rt = (struct rt6_info *)inet->cork.dst;
1182                 fl = &inet->cork.fl;
1183                 opt = np->cork.opt;
1184                 transhdrlen = 0;
1185                 exthdrlen = 0;
1186                 mtu = inet->cork.fragsize;
1187         }
1188
1189         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1190
1191         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1192                         (opt ? opt->opt_nflen : 0);
1193         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1194
1195         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1196                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1197                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1198                         return -EMSGSIZE;
1199                 }
1200         }
1201
1202         /*
1203          * Let's try using as much space as possible.
1204          * Use MTU if total length of the message fits into the MTU.
1205          * Otherwise, we need to reserve fragment header and
1206          * fragment alignment (= 8-15 octects, in total).
1207          *
1208          * Note that we may need to "move" the data from the tail of
1209          * of the buffer to the new fragment when we split
1210          * the message.
1211          *
1212          * FIXME: It may be fragmented into multiple chunks
1213          *        at once if non-fragmentable extension headers
1214          *        are too large.
1215          * --yoshfuji
1216          */
1217
1218         inet->cork.length += length;
1219         if (length > mtu) {
1220                 int proto = sk->sk_protocol;
1221                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1222                         ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1223                         return -EMSGSIZE;
1224                 }
1225
1226                 if (proto == IPPROTO_UDP &&
1227                     (rt->dst.dev->features & NETIF_F_UFO)) {
1228
1229                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1230                                                   hh_len, fragheaderlen,
1231                                                   transhdrlen, mtu, flags);
1232                         if (err)
1233                                 goto error;
1234                         return 0;
1235                 }
1236         }
1237
1238         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1239                 goto alloc_new_skb;
1240
1241         while (length > 0) {
1242                 /* Check if the remaining data fits into current packet. */
1243                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1244                 if (copy < length)
1245                         copy = maxfraglen - skb->len;
1246
1247                 if (copy <= 0) {
1248                         char *data;
1249                         unsigned int datalen;
1250                         unsigned int fraglen;
1251                         unsigned int fraggap;
1252                         unsigned int alloclen;
1253                         struct sk_buff *skb_prev;
1254 alloc_new_skb:
1255                         skb_prev = skb;
1256
1257                         /* There's no room in the current skb */
1258                         if (skb_prev)
1259                                 fraggap = skb_prev->len - maxfraglen;
1260                         else
1261                                 fraggap = 0;
1262
1263                         /*
1264                          * If remaining data exceeds the mtu,
1265                          * we know we need more fragment(s).
1266                          */
1267                         datalen = length + fraggap;
1268                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1269                                 datalen = maxfraglen - fragheaderlen;
1270
1271                         fraglen = datalen + fragheaderlen;
1272                         if ((flags & MSG_MORE) &&
1273                             !(rt->dst.dev->features&NETIF_F_SG))
1274                                 alloclen = mtu;
1275                         else
1276                                 alloclen = datalen + fragheaderlen;
1277
1278                         /*
1279                          * The last fragment gets additional space at tail.
1280                          * Note: we overallocate on fragments with MSG_MODE
1281                          * because we have no idea if we're the last one.
1282                          */
1283                         if (datalen == length + fraggap)
1284                                 alloclen += rt->dst.trailer_len;
1285
1286                         /*
1287                          * We just reserve space for fragment header.
1288                          * Note: this may be overallocation if the message
1289                          * (without MSG_MORE) fits into the MTU.
1290                          */
1291                         alloclen += sizeof(struct frag_hdr);
1292
1293                         if (transhdrlen) {
1294                                 skb = sock_alloc_send_skb(sk,
1295                                                 alloclen + hh_len,
1296                                                 (flags & MSG_DONTWAIT), &err);
1297                         } else {
1298                                 skb = NULL;
1299                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1300                                     2 * sk->sk_sndbuf)
1301                                         skb = sock_wmalloc(sk,
1302                                                            alloclen + hh_len, 1,
1303                                                            sk->sk_allocation);
1304                                 if (unlikely(skb == NULL))
1305                                         err = -ENOBUFS;
1306                         }
1307                         if (skb == NULL)
1308                                 goto error;
1309                         /*
1310                          *      Fill in the control structures
1311                          */
1312                         skb->ip_summed = csummode;
1313                         skb->csum = 0;
1314                         /* reserve for fragmentation */
1315                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1316
1317                         /*
1318                          *      Find where to start putting bytes
1319                          */
1320                         data = skb_put(skb, fraglen);
1321                         skb_set_network_header(skb, exthdrlen);
1322                         data += fragheaderlen;
1323                         skb->transport_header = (skb->network_header +
1324                                                  fragheaderlen);
1325                         if (fraggap) {
1326                                 skb->csum = skb_copy_and_csum_bits(
1327                                         skb_prev, maxfraglen,
1328                                         data + transhdrlen, fraggap, 0);
1329                                 skb_prev->csum = csum_sub(skb_prev->csum,
1330                                                           skb->csum);
1331                                 data += fraggap;
1332                                 pskb_trim_unique(skb_prev, maxfraglen);
1333                         }
1334                         copy = datalen - transhdrlen - fraggap;
1335                         if (copy < 0) {
1336                                 err = -EINVAL;
1337                                 kfree_skb(skb);
1338                                 goto error;
1339                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1340                                 err = -EFAULT;
1341                                 kfree_skb(skb);
1342                                 goto error;
1343                         }
1344
1345                         offset += copy;
1346                         length -= datalen - fraggap;
1347                         transhdrlen = 0;
1348                         exthdrlen = 0;
1349                         csummode = CHECKSUM_NONE;
1350
1351                         /*
1352                          * Put the packet on the pending queue
1353                          */
1354                         __skb_queue_tail(&sk->sk_write_queue, skb);
1355                         continue;
1356                 }
1357
1358                 if (copy > length)
1359                         copy = length;
1360
1361                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1362                         unsigned int off;
1363
1364                         off = skb->len;
1365                         if (getfrag(from, skb_put(skb, copy),
1366                                                 offset, copy, off, skb) < 0) {
1367                                 __skb_trim(skb, off);
1368                                 err = -EFAULT;
1369                                 goto error;
1370                         }
1371                 } else {
1372                         int i = skb_shinfo(skb)->nr_frags;
1373                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1374                         struct page *page = sk->sk_sndmsg_page;
1375                         int off = sk->sk_sndmsg_off;
1376                         unsigned int left;
1377
1378                         if (page && (left = PAGE_SIZE - off) > 0) {
1379                                 if (copy >= left)
1380                                         copy = left;
1381                                 if (page != frag->page) {
1382                                         if (i == MAX_SKB_FRAGS) {
1383                                                 err = -EMSGSIZE;
1384                                                 goto error;
1385                                         }
1386                                         get_page(page);
1387                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1388                                         frag = &skb_shinfo(skb)->frags[i];
1389                                 }
1390                         } else if(i < MAX_SKB_FRAGS) {
1391                                 if (copy > PAGE_SIZE)
1392                                         copy = PAGE_SIZE;
1393                                 page = alloc_pages(sk->sk_allocation, 0);
1394                                 if (page == NULL) {
1395                                         err = -ENOMEM;
1396                                         goto error;
1397                                 }
1398                                 sk->sk_sndmsg_page = page;
1399                                 sk->sk_sndmsg_off = 0;
1400
1401                                 skb_fill_page_desc(skb, i, page, 0, 0);
1402                                 frag = &skb_shinfo(skb)->frags[i];
1403                         } else {
1404                                 err = -EMSGSIZE;
1405                                 goto error;
1406                         }
1407                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1408                                 err = -EFAULT;
1409                                 goto error;
1410                         }
1411                         sk->sk_sndmsg_off += copy;
1412                         frag->size += copy;
1413                         skb->len += copy;
1414                         skb->data_len += copy;
1415                         skb->truesize += copy;
1416                         atomic_add(copy, &sk->sk_wmem_alloc);
1417                 }
1418                 offset += copy;
1419                 length -= copy;
1420         }
1421         return 0;
1422 error:
1423         inet->cork.length -= length;
1424         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1425         return err;
1426 }
1427
1428 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1429 {
1430         if (np->cork.opt) {
1431                 kfree(np->cork.opt->dst0opt);
1432                 kfree(np->cork.opt->dst1opt);
1433                 kfree(np->cork.opt->hopopt);
1434                 kfree(np->cork.opt->srcrt);
1435                 kfree(np->cork.opt);
1436                 np->cork.opt = NULL;
1437         }
1438
1439         if (inet->cork.dst) {
1440                 dst_release(inet->cork.dst);
1441                 inet->cork.dst = NULL;
1442                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1443         }
1444         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1445 }
1446
1447 int ip6_push_pending_frames(struct sock *sk)
1448 {
1449         struct sk_buff *skb, *tmp_skb;
1450         struct sk_buff **tail_skb;
1451         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1452         struct inet_sock *inet = inet_sk(sk);
1453         struct ipv6_pinfo *np = inet6_sk(sk);
1454         struct net *net = sock_net(sk);
1455         struct ipv6hdr *hdr;
1456         struct ipv6_txoptions *opt = np->cork.opt;
1457         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1458         struct flowi *fl = &inet->cork.fl;
1459         unsigned char proto = fl->proto;
1460         int err = 0;
1461
1462         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1463                 goto out;
1464         tail_skb = &(skb_shinfo(skb)->frag_list);
1465
1466         /* move skb->data to ip header from ext header */
1467         if (skb->data < skb_network_header(skb))
1468                 __skb_pull(skb, skb_network_offset(skb));
1469         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1470                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1471                 *tail_skb = tmp_skb;
1472                 tail_skb = &(tmp_skb->next);
1473                 skb->len += tmp_skb->len;
1474                 skb->data_len += tmp_skb->len;
1475                 skb->truesize += tmp_skb->truesize;
1476                 tmp_skb->destructor = NULL;
1477                 tmp_skb->sk = NULL;
1478         }
1479
1480         /* Allow local fragmentation. */
1481         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1482                 skb->local_df = 1;
1483
1484         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1485         __skb_pull(skb, skb_network_header_len(skb));
1486         if (opt && opt->opt_flen)
1487                 ipv6_push_frag_opts(skb, opt, &proto);
1488         if (opt && opt->opt_nflen)
1489                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1490
1491         skb_push(skb, sizeof(struct ipv6hdr));
1492         skb_reset_network_header(skb);
1493         hdr = ipv6_hdr(skb);
1494
1495         *(__be32*)hdr = fl->fl6_flowlabel |
1496                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1497
1498         hdr->hop_limit = np->cork.hop_limit;
1499         hdr->nexthdr = proto;
1500         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1501         ipv6_addr_copy(&hdr->daddr, final_dst);
1502
1503         skb->priority = sk->sk_priority;
1504         skb->mark = sk->sk_mark;
1505
1506         skb_dst_set(skb, dst_clone(&rt->dst));
1507         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1508         if (proto == IPPROTO_ICMPV6) {
1509                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1510
1511                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1512                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1513         }
1514
1515         err = ip6_local_out(skb);
1516         if (err) {
1517                 if (err > 0)
1518                         err = net_xmit_errno(err);
1519                 if (err)
1520                         goto error;
1521         }
1522
1523 out:
1524         ip6_cork_release(inet, np);
1525         return err;
1526 error:
1527         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1528         goto out;
1529 }
1530
1531 void ip6_flush_pending_frames(struct sock *sk)
1532 {
1533         struct sk_buff *skb;
1534
1535         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1536                 if (skb_dst(skb))
1537                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1538                                       IPSTATS_MIB_OUTDISCARDS);
1539                 kfree_skb(skb);
1540         }
1541
1542         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1543 }