/*
 *	net/ipv6/ip6_output.c
 *
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

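/*
 * Fill in the IPv6 payload length and run the netfilter LOCAL_OUT hook.
 * nf_hook() returns 1 when the packet is accepted without the okfn having
 * been invoked, which is why ip6_local_out() below treats a return value
 * of 1 as "go ahead and call dst_output()".
 */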
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

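/*
 * Final transmit step: use the cached hardware header if the dst has one,
 * otherwise hand the packet to the neighbour's output routine. With
 * neither available there is no way to build a link-layer header, so the
 * packet is counted as OUTNOROUTES and dropped.
 */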
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx(newskb);
	return 0;
}

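/*
 * Post-routing output. For multicast destinations a clone may be looped
 * back through ip6_dev_loopback_xmit(): either a multicast-routing socket
 * wants to see locally originated packets, or a local listener has joined
 * the group (and the sending socket left IPV6_MULTICAST_LOOP enabled).
 */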
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

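/*
 * dst_output() entry point for locally generated packets: fragment when
 * the packet exceeds the path MTU and is not GSO, or when the route
 * requires fragmentation on all frames (dst_allfrag).
 */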
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

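/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (ip6_ra_chain). The last match
 * consumes the original skb; earlier matches receive clones. Returns 1
 * if the packet was delivered, telling ip6_forward() to stop.
 */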
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

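/*
 * Decide what to do with a packet that matched a proxy ND entry:
 * returns 1 to deliver it locally (unicast NDISC messages for the
 * proxied address), -1 to drop it (link-local destination), and 0 to
 * forward it normally.
 */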
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

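/*
 * Forwarding path. In order: check that forwarding is enabled, apply the
 * FWD xfrm policy, hand Router Alert packets to ip6_call_ra_chain(),
 * enforce the hop limit (ICMPv6 time exceeded), handle proxy ND, validate
 * the source address before sending a redirect (security critical),
 * enforce the outgoing MTU (ICMPv6 packet too big), and finally decrement
 * hop_limit after skb_cow() and pass the packet to NF_INET_FORWARD.
 */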
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets, pushing them to
	 *	user level AS IS without any warranty that the application
	 *	will be able to interpret them. The reason is that we
	 *	cannot do anything clever here.
	 *
	 *	We are not an end-node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

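/*
 * Find the offset at which a Fragment header must be inserted: it goes
 * after the headers that belong to the unfragmentable part (hop-by-hop,
 * routing, and a destination options header that precedes a routing
 * header), following the RFC 2460 extension header ordering rules.
 * *nexthdr is left pointing at the nexthdr byte that the caller patches
 * to NEXTHDR_FRAGMENT.
 */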
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

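/*
 * Fragment an over-sized packet. Two strategies: a fast path that reuses
 * an existing, well-formed frag_list (every piece already 8-byte aligned
 * with headroom for the headers) by prepending a copy of the
 * unfragmentable headers plus a Fragment header to each piece, and a
 * slow path that allocates fresh skbs and copies the payload out in
 * MTU-sized, 8-byte-aligned blocks.
 */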
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket. (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

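/*
 * Common tail of the dst lookup helpers: resolve the route if the caller
 * did not supply one, then pick a source address when the flow leaves it
 * unspecified (and, with optimistic DAD, fall back to the default
 * router's dst entry when the neighbour is not yet valid).
 */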
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

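/*
 * Append data to the pending send queue of a corked socket. On the first
 * call (empty write queue) the cork state is set up: the txoptions are
 * duplicated, the route and flow are pinned, and the fragment size is
 * fixed. Later calls reuse that state. Datagram protocols drive it
 * roughly like this (a sketch; see udpv6_sendmsg for the real sequence):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, ...);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 */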
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

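/*
 * Release everything pinned by the cork: the duplicated txoptions, the
 * held route, and the saved flow.
 */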
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

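/*
 * Glue the queued skbs into one packet (the tail skbs become the
 * frag_list of the first), prepend the extension headers and the IPv6
 * header from the cork state, then send via ip6_local_out(). The cork
 * is released on both success and error.
 */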
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

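/* Drop everything still sitting on the write queue and uncork. */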
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}