net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 int __ip6_local_out(struct sk_buff *skb)
  62 {
  63         int len;
  64
  65         len = skb->len - sizeof(struct ipv6hdr);
  66         if (len > IPV6_MAXPLEN)
  67                 len = 0;
  68         ipv6_hdr(skb)->payload_len = htons(len);
  69
  70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  71                        skb_dst(skb)->dev, dst_output);
  72 }
  73
  /*
   *      Send a locally generated packet: run __ip6_local_out() and, when it
   *      returns 1, hand the skb to dst_output().  Any other value from
   *      __ip6_local_out() is returned to the caller unchanged.
   */
  int ip6_local_out(struct sk_buff *skb)
  {
          int rc = __ip6_local_out(skb);

          if (unlikely(rc != 1))
                  return rc;

          return dst_output(skb);
  }
  EXPORT_SYMBOL_GPL(ip6_local_out);
  85
  86 /* dev_loopback_xmit for use with netfilter. */
  87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  88 {
  89         skb_reset_mac_header(newskb);
  90         __skb_pull(newskb, skb_network_offset(newskb));
  91         newskb->pkt_type = PACKET_LOOPBACK;
  92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
  93         WARN_ON(!skb_dst(newskb));
  94
  95         netif_rx_ni(newskb);
  96         return 0;
  97 }
  98
  99 static int ip6_finish_output2(struct sk_buff *skb)
 100 {
 101         struct dst_entry *dst = skb_dst(skb);
 102         struct net_device *dev = dst->dev;
 103         struct neighbour *neigh;
 104
 105         skb->protocol = htons(ETH_P_IPV6);
 106         skb->dev = dev;
 107
 108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 110
 111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
 112                     ((mroute6_socket(dev_net(dev), skb) &&
 113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 115                                          &ipv6_hdr(skb)->saddr))) {
 116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 117
 118                         /* Do not check for IFF_ALLMULTI; multicast routing
 119                            is not supported in any case.
 120                          */
 121                         if (newskb)
 122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 123                                         newskb, NULL, newskb->dev,
 124                                         ip6_dev_loopback_xmit);
 125
 126                         if (ipv6_hdr(skb)->hop_limit == 0) {
 127                                 IP6_INC_STATS(dev_net(dev), idev,
 128                                               IPSTATS_MIB_OUTDISCARDS);
 129                                 kfree_skb(skb);
 130                                 return 0;
 131                         }
 132                 }
 133
 134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 135                                 skb->len);
 136         }
 137
 138         rcu_read_lock();
 139         neigh = dst_get_neighbour(dst);
 140         if (neigh) {
 141                 int res = neigh_output(neigh, skb);
 142
 143                 rcu_read_unlock();
 144                 return res;
 145         }
 146         rcu_read_unlock();
 147         IP6_INC_STATS_BH(dev_net(dst->dev),
 148                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 149         kfree_skb(skb);
 150         return -EINVAL;
 151 }
 152
 153 static int ip6_finish_output(struct sk_buff *skb)
 154 {
 155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 156             dst_allfrag(skb_dst(skb)))
 157                 return ip6_fragment(skb, ip6_finish_output2);
 158         else
 159                 return ip6_finish_output2(skb);
 160 }
 161
 162 int ip6_output(struct sk_buff *skb)
 163 {
 164         struct net_device *dev = skb_dst(skb)->dev;
 165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 166         if (unlikely(idev->cnf.disable_ipv6)) {
 167                 IP6_INC_STATS(dev_net(dev), idev,
 168                               IPSTATS_MIB_OUTDISCARDS);
 169                 kfree_skb(skb);
 170                 return 0;
 171         }
 172
 173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 174                             ip6_finish_output,
 175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 176 }
 177
 178 /*
 179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 180  */
 181
 182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 183              struct ipv6_txoptions *opt)
 184 {
 185         struct net *net = sock_net(sk);
 186         struct ipv6_pinfo *np = inet6_sk(sk);
 187         struct in6_addr *first_hop = &fl6->daddr;
 188         struct dst_entry *dst = skb_dst(skb);
 189         struct ipv6hdr *hdr;
 190         u8  proto = fl6->flowi6_proto;
 191         int seg_len = skb->len;
 192         int hlimit = -1;
 193         int tclass = 0;
 194         u32 mtu;
 195
 196         if (opt) {
 197                 unsigned int head_room;
 198
 199                 /* First: exthdrs may take lots of space (~8K for now)
 200                    MAX_HEADER is not enough.
 201                  */
 202                 head_room = opt->opt_nflen + opt->opt_flen;
 203                 seg_len += head_room;
 204                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 205
 206                 if (skb_headroom(skb) < head_room) {
 207                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 208                         if (skb2 == NULL) {
 209                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 210                                               IPSTATS_MIB_OUTDISCARDS);
 211                                 kfree_skb(skb);
 212                                 return -ENOBUFS;
 213                         }
 214                         kfree_skb(skb);
 215                         skb = skb2;
 216                         skb_set_owner_w(skb, sk);
 217                 }
 218                 if (opt->opt_flen)
 219                         ipv6_push_frag_opts(skb, opt, &proto);
 220                 if (opt->opt_nflen)
 221                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 222         }
 223
 224         skb_push(skb, sizeof(struct ipv6hdr));
 225         skb_reset_network_header(skb);
 226         hdr = ipv6_hdr(skb);
 227
 228         /*
 229          *      Fill in the IPv6 header
 230          */
 231         if (np) {
 232                 tclass = np->tclass;
 233                 hlimit = np->hop_limit;
 234         }
 235         if (hlimit < 0)
 236                 hlimit = ip6_dst_hoplimit(dst);
 237
 238         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
 239
 240         hdr->payload_len = htons(seg_len);
 241         hdr->nexthdr = proto;
 242         hdr->hop_limit = hlimit;
 243
 244         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
 245         ipv6_addr_copy(&hdr->daddr, first_hop);
 246
 247         skb->priority = sk->sk_priority;
 248         skb->mark = sk->sk_mark;
 249
 250         mtu = dst_mtu(dst);
 251         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 252                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 253                               IPSTATS_MIB_OUT, skb->len);
 254                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 255                                dst->dev, dst_output);
 256         }
 257
 258         if (net_ratelimit())
 259                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 260         skb->dev = dst->dev;
 261         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 262         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 263         kfree_skb(skb);
 264         return -EMSGSIZE;
 265 }
 266
 267 EXPORT_SYMBOL(ip6_xmit);
 268
 269 /*
 270  *      To avoid extra problems ND packets are send through this
 271  *      routine. It's code duplication but I really want to avoid
 272  *      extra checks since ipv6_build_header is used by TCP (which
 273  *      is for us performance critical)
 274  */
 275
 276 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 277                const struct in6_addr *saddr, const struct in6_addr *daddr,
 278                int proto, int len)
 279 {
 280         struct ipv6_pinfo *np = inet6_sk(sk);
 281         struct ipv6hdr *hdr;
 282
 283         skb->protocol = htons(ETH_P_IPV6);
 284         skb->dev = dev;
 285
 286         skb_reset_network_header(skb);
 287         skb_put(skb, sizeof(struct ipv6hdr));
 288         hdr = ipv6_hdr(skb);
 289
 290         *(__be32*)hdr = htonl(0x60000000);
 291
 292         hdr->payload_len = htons(len);
 293         hdr->nexthdr = proto;
 294         hdr->hop_limit = np->hop_limit;
 295
 296         ipv6_addr_copy(&hdr->saddr, saddr);
 297         ipv6_addr_copy(&hdr->daddr, daddr);
 298
 299         return 0;
 300 }
 301
 302 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 303 {
 304         struct ip6_ra_chain *ra;
 305         struct sock *last = NULL;
 306
 307         read_lock(&ip6_ra_lock);
 308         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 309                 struct sock *sk = ra->sk;
 310                 if (sk && ra->sel == sel &&
 311                     (!sk->sk_bound_dev_if ||
 312                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 313                         if (last) {
 314                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 315                                 if (skb2)
 316                                         rawv6_rcv(last, skb2);
 317                         }
 318                         last = sk;
 319                 }
 320         }
 321
 322         if (last) {
 323                 rawv6_rcv(last, skb);
 324                 read_unlock(&ip6_ra_lock);
 325                 return 1;
 326         }
 327         read_unlock(&ip6_ra_lock);
 328         return 0;
 329 }
 330
 331 static int ip6_forward_proxy_check(struct sk_buff *skb)
 332 {
 333         struct ipv6hdr *hdr = ipv6_hdr(skb);
 334         u8 nexthdr = hdr->nexthdr;
 335         int offset;
 336
 337         if (ipv6_ext_hdr(nexthdr)) {
 338                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 339                 if (offset < 0)
 340                         return 0;
 341         } else
 342                 offset = sizeof(struct ipv6hdr);
 343
 344         if (nexthdr == IPPROTO_ICMPV6) {
 345                 struct icmp6hdr *icmp6;
 346
 347                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 348                                          offset + 1 - skb->data)))
 349                         return 0;
 350
 351                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 352
 353                 switch (icmp6->icmp6_type) {
 354                 case NDISC_ROUTER_SOLICITATION:
 355                 case NDISC_ROUTER_ADVERTISEMENT:
 356                 case NDISC_NEIGHBOUR_SOLICITATION:
 357                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 358                 case NDISC_REDIRECT:
 359                         /* For reaction involving unicast neighbor discovery
 360                          * message destined to the proxied address, pass it to
 361                          * input function.
 362                          */
 363                         return 1;
 364                 default:
 365                         break;
 366                 }
 367         }
 368
 369         /*
 370          * The proxying router can't forward traffic sent to a link-local
 371          * address, so signal the sender and discard the packet. This
 372          * behavior is clarified by the MIPv6 specification.
 373          */
 374         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 375                 dst_link_failure(skb);
 376                 return -1;
 377         }
 378
 379         return 0;
 380 }
 381
 /*
  *      Continuation for the NF_INET_FORWARD hook in ip6_forward(): pass
  *      the packet to dst_output() and return its result.
  */
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
         int rc;

         rc = dst_output(skb);
         return rc;
 }
 386
 387 int ip6_forward(struct sk_buff *skb)
 388 {
 389         struct dst_entry *dst = skb_dst(skb);
 390         struct ipv6hdr *hdr = ipv6_hdr(skb);
 391         struct inet6_skb_parm *opt = IP6CB(skb);
 392         struct net *net = dev_net(dst->dev);
 393         struct neighbour *n;
 394         u32 mtu;
 395
 396         if (net->ipv6.devconf_all->forwarding == 0)
 397                 goto error;
 398
 399         if (skb_warn_if_lro(skb))
 400                 goto drop;
 401
 402         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 403                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 404                 goto drop;
 405         }
 406
 407         if (skb->pkt_type != PACKET_HOST)
 408                 goto drop;
 409
 410         skb_forward_csum(skb);
 411
 412         /*
 413          *      We DO NOT make any processing on
 414          *      RA packets, pushing them to user level AS IS
 415          *      without ane WARRANTY that application will be able
 416          *      to interpret them. The reason is that we
 417          *      cannot make anything clever here.
 418          *
 419          *      We are not end-node, so that if packet contains
 420          *      AH/ESP, we cannot make anything.
 421          *      Defragmentation also would be mistake, RA packets
 422          *      cannot be fragmented, because there is no warranty
 423          *      that different fragments will go along one path. --ANK
 424          */
 425         if (opt->ra) {
 426                 u8 *ptr = skb_network_header(skb) + opt->ra;
 427                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 428                         return 0;
 429         }
 430
 431         /*
 432          *      check and decrement ttl
 433          */
 434         if (hdr->hop_limit <= 1) {
 435                 /* Force OUTPUT device used as source address */
 436                 skb->dev = dst->dev;
 437                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 438                 IP6_INC_STATS_BH(net,
 439                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 440
 441                 kfree_skb(skb);
 442                 return -ETIMEDOUT;
 443         }
 444
 445         /* XXX: idev->cnf.proxy_ndp? */
 446         if (net->ipv6.devconf_all->proxy_ndp &&
 447             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 448                 int proxied = ip6_forward_proxy_check(skb);
 449                 if (proxied > 0)
 450                         return ip6_input(skb);
 451                 else if (proxied < 0) {
 452                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 453                                       IPSTATS_MIB_INDISCARDS);
 454                         goto drop;
 455                 }
 456         }
 457
 458         if (!xfrm6_route_forward(skb)) {
 459                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 460                 goto drop;
 461         }
 462         dst = skb_dst(skb);
 463
 464         /* IPv6 specs say nothing about it, but it is clear that we cannot
 465            send redirects to source routed frames.
 466            We don't send redirects to frames decapsulated from IPsec.
 467          */
 468         n = dst_get_neighbour(dst);
 469         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
 470                 struct in6_addr *target = NULL;
 471                 struct rt6_info *rt;
 472
 473                 /*
 474                  *      incoming and outgoing devices are the same
 475                  *      send a redirect.
 476                  */
 477
 478                 rt = (struct rt6_info *) dst;
 479                 if ((rt->rt6i_flags & RTF_GATEWAY))
 480                         target = (struct in6_addr*)&n->primary_key;
 481                 else
 482                         target = &hdr->daddr;
 483
 484                 if (!rt->rt6i_peer)
 485                         rt6_bind_peer(rt, 1);
 486
 487                 /* Limit redirects both by destination (here)
 488                    and by source (inside ndisc_send_redirect)
 489                  */
 490                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
 491                         ndisc_send_redirect(skb, n, target);
 492         } else {
 493                 int addrtype = ipv6_addr_type(&hdr->saddr);
 494
 495                 /* This check is security critical. */
 496                 if (addrtype == IPV6_ADDR_ANY ||
 497                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 498                         goto error;
 499                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 500                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 501                                     ICMPV6_NOT_NEIGHBOUR, 0);
 502                         goto error;
 503                 }
 504         }
 505
 506         mtu = dst_mtu(dst);
 507         if (mtu < IPV6_MIN_MTU)
 508                 mtu = IPV6_MIN_MTU;
 509
 510         if (skb->len > mtu && !skb_is_gso(skb)) {
 511                 /* Again, force OUTPUT device used as source address */
 512                 skb->dev = dst->dev;
 513                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 514                 IP6_INC_STATS_BH(net,
 515                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 516                 IP6_INC_STATS_BH(net,
 517                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 518                 kfree_skb(skb);
 519                 return -EMSGSIZE;
 520         }
 521
 522         if (skb_cow(skb, dst->dev->hard_header_len)) {
 523                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 524                 goto drop;
 525         }
 526
 527         hdr = ipv6_hdr(skb);
 528
 529         /* Mangling hops number delayed to point after skb COW */
 530
 531         hdr->hop_limit--;
 532
 533         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 534         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 535                        ip6_forward_finish);
 536
 537 error:
 538         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 539 drop:
 540         kfree_skb(skb);
 541         return -EINVAL;
 542 }
 543
 544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 545 {
 546         to->pkt_type = from->pkt_type;
 547         to->priority = from->priority;
 548         to->protocol = from->protocol;
 549         skb_dst_drop(to);
 550         skb_dst_set(to, dst_clone(skb_dst(from)));
 551         to->dev = from->dev;
 552         to->mark = from->mark;
 553
 554 #ifdef CONFIG_NET_SCHED
 555         to->tc_index = from->tc_index;
 556 #endif
 557         nf_copy(to, from);
 558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 560         to->nf_trace = from->nf_trace;
 561 #endif
 562         skb_copy_secmark(to, from);
 563 }
 564
 565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 566 {
 567         u16 offset = sizeof(struct ipv6hdr);
 568         struct ipv6_opt_hdr *exthdr =
 569                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 570         unsigned int packet_len = skb->tail - skb->network_header;
 571         int found_rhdr = 0;
 572         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 573
 574         while (offset + 1 <= packet_len) {
 575
 576                 switch (**nexthdr) {
 577
 578                 case NEXTHDR_HOP:
 579                         break;
 580                 case NEXTHDR_ROUTING:
 581                         found_rhdr = 1;
 582                         break;
 583                 case NEXTHDR_DEST:
 584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 585                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 586                                 break;
 587 #endif
 588                         if (found_rhdr)
 589                                 return offset;
 590                         break;
 591                 default :
 592                         return offset;
 593                 }
 594
 595                 offset += ipv6_optlen(exthdr);
 596                 *nexthdr = &exthdr->nexthdr;
 597                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 598                                                  offset);
 599         }
 600
 601         return offset;
 602 }
 603
 604 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 605 {
 606         static atomic_t ipv6_fragmentation_id;
 607         int old, new;
 608
 609         if (rt) {
 610                 struct inet_peer *peer;
 611
 612                 if (!rt->rt6i_peer)
 613                         rt6_bind_peer(rt, 1);
 614                 peer = rt->rt6i_peer;
 615                 if (peer) {
 616                         fhdr->identification = htonl(inet_getid(peer, 0));
 617                         return;
 618                 }
 619         }
 620         do {
 621                 old = atomic_read(&ipv6_fragmentation_id);
 622                 new = old + 1;
 623                 if (!new)
 624                         new = 1;
 625         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
 626         fhdr->identification = htonl(new);
 627 }
 628
 629 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 630 {
 631         struct sk_buff *frag;
 632         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 633         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 634         struct ipv6hdr *tmp_hdr;
 635         struct frag_hdr *fh;
 636         unsigned int mtu, hlen, left, len;
 637         __be32 frag_id = 0;
 638         int ptr, offset = 0, err=0;
 639         u8 *prevhdr, nexthdr = 0;
 640         struct net *net = dev_net(skb_dst(skb)->dev);
 641
 642         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 643         nexthdr = *prevhdr;
 644
 645         mtu = ip6_skb_dst_mtu(skb);
 646
 647         /* We must not fragment if the socket is set to force MTU discovery
 648          * or if the skb it not generated by a local socket.
 649          */
 650         if (!skb->local_df && skb->len > mtu) {
 651                 skb->dev = skb_dst(skb)->dev;
 652                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 653                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 654                               IPSTATS_MIB_FRAGFAILS);
 655                 kfree_skb(skb);
 656                 return -EMSGSIZE;
 657         }
 658
 659         if (np && np->frag_size < mtu) {
 660                 if (np->frag_size)
 661                         mtu = np->frag_size;
 662         }
 663         mtu -= hlen + sizeof(struct frag_hdr);
 664
 665         if (skb_has_frag_list(skb)) {
 666                 int first_len = skb_pagelen(skb);
 667                 struct sk_buff *frag2;
 668
 669                 if (first_len - hlen > mtu ||
 670                     ((first_len - hlen) & 7) ||
 671                     skb_cloned(skb))
 672                         goto slow_path;
 673
 674                 skb_walk_frags(skb, frag) {
 675                         /* Correct geometry. */
 676                         if (frag->len > mtu ||
 677                             ((frag->len & 7) && frag->next) ||
 678                             skb_headroom(frag) < hlen)
 679                                 goto slow_path_clean;
 680
 681                         /* Partially cloned skb? */
 682                         if (skb_shared(frag))
 683                                 goto slow_path_clean;
 684
 685                         BUG_ON(frag->sk);
 686                         if (skb->sk) {
 687                                 frag->sk = skb->sk;
 688                                 frag->destructor = sock_wfree;
 689                         }
 690                         skb->truesize -= frag->truesize;
 691                 }
 692
 693                 err = 0;
 694                 offset = 0;
 695                 frag = skb_shinfo(skb)->frag_list;
 696                 skb_frag_list_init(skb);
 697                 /* BUILD HEADER */
 698
 699                 *prevhdr = NEXTHDR_FRAGMENT;
 700                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 701                 if (!tmp_hdr) {
 702                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 703                                       IPSTATS_MIB_FRAGFAILS);
 704                         return -ENOMEM;
 705                 }
 706
 707                 __skb_pull(skb, hlen);
 708                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 709                 __skb_push(skb, hlen);
 710                 skb_reset_network_header(skb);
 711                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 712
 713                 ipv6_select_ident(fh, rt);
 714                 fh->nexthdr = nexthdr;
 715                 fh->reserved = 0;
 716                 fh->frag_off = htons(IP6_MF);
 717                 frag_id = fh->identification;
 718
 719                 first_len = skb_pagelen(skb);
 720                 skb->data_len = first_len - skb_headlen(skb);
 721                 skb->len = first_len;
 722                 ipv6_hdr(skb)->payload_len = htons(first_len -
 723                                                    sizeof(struct ipv6hdr));
 724
 725                 dst_hold(&rt->dst);
 726
 727                 for (;;) {
 728                         /* Prepare header of the next frame,
 729                          * before previous one went down. */
 730                         if (frag) {
 731                                 frag->ip_summed = CHECKSUM_NONE;
 732                                 skb_reset_transport_header(frag);
 733                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 734                                 __skb_push(frag, hlen);
 735                                 skb_reset_network_header(frag);
 736                                 memcpy(skb_network_header(frag), tmp_hdr,
 737                                        hlen);
 738                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 739                                 fh->nexthdr = nexthdr;
 740                                 fh->reserved = 0;
 741                                 fh->frag_off = htons(offset);
 742                                 if (frag->next != NULL)
 743                                         fh->frag_off |= htons(IP6_MF);
 744                                 fh->identification = frag_id;
 745                                 ipv6_hdr(frag)->payload_len =
 746                                                 htons(frag->len -
 747                                                       sizeof(struct ipv6hdr));
 748                                 ip6_copy_metadata(frag, skb);
 749                         }
 750
 751                         err = output(skb);
 752                         if(!err)
 753                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 754                                               IPSTATS_MIB_FRAGCREATES);
 755
 756                         if (err || !frag)
 757                                 break;
 758
 759                         skb = frag;
 760                         frag = skb->next;
 761                         skb->next = NULL;
 762                 }
 763
 764                 kfree(tmp_hdr);
 765
 766                 if (err == 0) {
 767                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 768                                       IPSTATS_MIB_FRAGOKS);
 769                         dst_release(&rt->dst);
 770                         return 0;
 771                 }
 772
 773                 while (frag) {
 774                         skb = frag->next;
 775                         kfree_skb(frag);
 776                         frag = skb;
 777                 }
 778
 779                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 780                               IPSTATS_MIB_FRAGFAILS);
 781                 dst_release(&rt->dst);
 782                 return err;
 783
 784 slow_path_clean:
 785                 skb_walk_frags(skb, frag2) {
 786                         if (frag2 == frag)
 787                                 break;
 788                         frag2->sk = NULL;
 789                         frag2->destructor = NULL;
 790                         skb->truesize += frag2->truesize;
 791                 }
 792         }
 793
 794 slow_path:
 795         left = skb->len - hlen;         /* Space per frame */
 796         ptr = hlen;                     /* Where to start from */
 797
 798         /*
 799          *      Fragment the datagram.
 800          */
 801
 802         *prevhdr = NEXTHDR_FRAGMENT;
 803
 804         /*
 805          *      Keep copying data until we run out.
 806          */
 807         while(left > 0) {
 808                 len = left;
 809                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 810                 if (len > mtu)
 811                         len = mtu;
 812                 /* IF: we are not sending up to and including the packet end
 813                    then align the next start on an eight byte boundary */
 814                 if (len < left) {
 815                         len &= ~7;
 816                 }
 817                 /*
 818                  *      Allocate buffer.
 819                  */
 820
 821                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
 822                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 823                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 824                                       IPSTATS_MIB_FRAGFAILS);
 825                         err = -ENOMEM;
 826                         goto fail;
 827                 }
 828
 829                 /*
 830                  *      Set up data on packet
 831                  */
 832
 833                 ip6_copy_metadata(frag, skb);
 834                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
 835                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 836                 skb_reset_network_header(frag);
 837                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 838                 frag->transport_header = (frag->network_header + hlen +
 839                                           sizeof(struct frag_hdr));
 840
 841                 /*
 842                  *      Charge the memory for the fragment to any owner
 843                  *      it might possess
 844                  */
 845                 if (skb->sk)
 846                         skb_set_owner_w(frag, skb->sk);
 847
 848                 /*
 849                  *      Copy the packet header into the new buffer.
 850                  */
 851                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 852
 853                 /*
 854                  *      Build fragment header.
 855                  */
 856                 fh->nexthdr = nexthdr;
 857                 fh->reserved = 0;
 858                 if (!frag_id) {
 859                         ipv6_select_ident(fh, rt);
 860                         frag_id = fh->identification;
 861                 } else
 862                         fh->identification = frag_id;
 863
 864                 /*
 865                  *      Copy a block of the IP datagram.
 866                  */
 867                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 868                         BUG();
 869                 left -= len;
 870
 871                 fh->frag_off = htons(offset);
 872                 if (left > 0)
 873                         fh->frag_off |= htons(IP6_MF);
 874                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 875                                                     sizeof(struct ipv6hdr));
 876
 877                 ptr += len;
 878                 offset += len;
 879
 880                 /*
 881                  *      Put this fragment into the sending queue.
 882                  */
 883                 err = output(frag);
 884                 if (err)
 885                         goto fail;
 886
 887                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 888                               IPSTATS_MIB_FRAGCREATES);
 889         }
 890         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 891                       IPSTATS_MIB_FRAGOKS);
 892         kfree_skb(skb);
 893         return err;
 894
 895 fail:
 896         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 897                       IPSTATS_MIB_FRAGFAILS);
 898         kfree_skb(skb);
 899         return err;
 900 }
 901
 902 static inline int ip6_rt_check(const struct rt6key *rt_key,
 903                                const struct in6_addr *fl_addr,
 904                                const struct in6_addr *addr_cache)
 905 {
 906         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 907                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 908 }
 909
 910 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 911                                           struct dst_entry *dst,
 912                                           const struct flowi6 *fl6)
 913 {
 914         struct ipv6_pinfo *np = inet6_sk(sk);
 915         struct rt6_info *rt = (struct rt6_info *)dst;
 916
 917         if (!dst)
 918                 goto out;
 919
 920         /* Yes, checking route validity in not connected
 921          * case is not very simple. Take into account,
 922          * that we do not support routing by source, TOS,
 923          * and MSG_DONTROUTE            --ANK (980726)
 924          *
 925          * 1. ip6_rt_check(): If route was host route,
 926          *    check that cached destination is current.
 927          *    If it is network route, we still may
 928          *    check its validity using saved pointer
 929          *    to the last used address: daddr_cache.
 930          *    We do not want to save whole address now,
 931          *    (because main consumer of this service
 932          *    is tcp, which has not this problem),
 933          *    so that the last trick works only on connected
 934          *    sockets.
 935          * 2. oif also should be the same.
 936          */
 937         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 938 #ifdef CONFIG_IPV6_SUBTREES
 939             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 940 #endif
 941             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 942                 dst_release(dst);
 943                 dst = NULL;
 944         }
 945
 946 out:
 947         return dst;
 948 }
 949
 950 static int ip6_dst_lookup_tail(struct sock *sk,
 951                                struct dst_entry **dst, struct flowi6 *fl6)
 952 {
 953         struct net *net = sock_net(sk);
 954 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 955         struct neighbour *n;
 956 #endif
 957         int err;
 958
 959         if (*dst == NULL)
 960                 *dst = ip6_route_output(net, sk, fl6);
 961
 962         if ((err = (*dst)->error))
 963                 goto out_err_release;
 964
 965         if (ipv6_addr_any(&fl6->saddr)) {
 966                 struct rt6_info *rt = (struct rt6_info *) *dst;
 967                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 968                                           sk ? inet6_sk(sk)->srcprefs : 0,
 969                                           &fl6->saddr);
 970                 if (err)
 971                         goto out_err_release;
 972         }
 973
 974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 975         /*
 976          * Here if the dst entry we've looked up
 977          * has a neighbour entry that is in the INCOMPLETE
 978          * state and the src address from the flow is
 979          * marked as OPTIMISTIC, we release the found
 980          * dst entry and replace it instead with the
 981          * dst entry of the nexthop router
 982          */
 983         rcu_read_lock();
 984         n = dst_get_neighbour(*dst);
 985         if (n && !(n->nud_state & NUD_VALID)) {
 986                 struct inet6_ifaddr *ifp;
 987                 struct flowi6 fl_gw6;
 988                 int redirect;
 989
 990                 rcu_read_unlock();
 991                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 992                                       (*dst)->dev, 1);
 993
 994                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 995                 if (ifp)
 996                         in6_ifa_put(ifp);
 997
 998                 if (redirect) {
 999                         /*
1000                          * We need to get the dst entry for the
1001                          * default router instead
1002                          */
1003                         dst_release(*dst);
1004                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1005                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1006                         *dst = ip6_route_output(net, sk, &fl_gw6);
1007                         if ((err = (*dst)->error))
1008                                 goto out_err_release;
1009                 }
1010         } else {
1011                 rcu_read_unlock();
1012         }
1013 #endif
1014
1015         return 0;
1016
1017 out_err_release:
1018         if (err == -ENETUNREACH)
1019                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1020         dst_release(*dst);
1021         *dst = NULL;
1022         return err;
1023 }
1024
1025 /**
1026  *      ip6_dst_lookup - perform route lookup on flow
1027  *      @sk: socket which provides route info
1028  *      @dst: pointer to dst_entry * for result
1029  *      @fl6: flow to lookup
1030  *
1031  *      This function performs a route lookup on the given flow.
1032  *
1033  *      It returns zero on success, or a standard errno code on error.
1034  */
1035 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1036 {
1037         *dst = NULL;
1038         return ip6_dst_lookup_tail(sk, dst, fl6);
1039 }
1040 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1041
1042 /**
1043  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1044  *      @sk: socket which provides route info
1045  *      @fl6: flow to lookup
1046  *      @final_dst: final destination address for ipsec lookup
1047  *      @can_sleep: we are in a sleepable context
1048  *
1049  *      This function performs a route lookup on the given flow.
1050  *
1051  *      It returns a valid dst pointer on success, or a pointer encoded
1052  *      error code.
1053  */
1054 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1055                                       const struct in6_addr *final_dst,
1056                                       bool can_sleep)
1057 {
1058         struct dst_entry *dst = NULL;
1059         int err;
1060
1061         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1062         if (err)
1063                 return ERR_PTR(err);
1064         if (final_dst)
1065                 ipv6_addr_copy(&fl6->daddr, final_dst);
1066         if (can_sleep)
1067                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1068
1069         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1072
1073 /**
1074  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1075  *      @sk: socket which provides the dst cache and route info
1076  *      @fl6: flow to lookup
1077  *      @final_dst: final destination address for ipsec lookup
1078  *      @can_sleep: we are in a sleepable context
1079  *
1080  *      This function performs a route lookup on the given flow with the
1081  *      possibility of using the cached route in the socket if it is valid.
1082  *      It will take the socket dst lock when operating on the dst cache.
1083  *      As a result, this function can only be used in process context.
1084  *
1085  *      It returns a valid dst pointer on success, or a pointer encoded
1086  *      error code.
1087  */
1088 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1089                                          const struct in6_addr *final_dst,
1090                                          bool can_sleep)
1091 {
1092         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1093         int err;
1094
1095         dst = ip6_sk_dst_check(sk, dst, fl6);
1096
1097         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1098         if (err)
1099                 return ERR_PTR(err);
1100         if (final_dst)
1101                 ipv6_addr_copy(&fl6->daddr, final_dst);
1102         if (can_sleep)
1103                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1104
1105         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1106 }
1107 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1108
1109 static inline int ip6_ufo_append_data(struct sock *sk,
1110                         int getfrag(void *from, char *to, int offset, int len,
1111                         int odd, struct sk_buff *skb),
1112                         void *from, int length, int hh_len, int fragheaderlen,
1113                         int transhdrlen, int mtu,unsigned int flags,
1114                         struct rt6_info *rt)
1115
1116 {
1117         struct sk_buff *skb;
1118         int err;
1119
1120         /* There is support for UDP large send offload by network
1121          * device, so create one single skb packet containing complete
1122          * udp datagram
1123          */
1124         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1125                 skb = sock_alloc_send_skb(sk,
1126                         hh_len + fragheaderlen + transhdrlen + 20,
1127                         (flags & MSG_DONTWAIT), &err);
1128                 if (skb == NULL)
1129                         return -ENOMEM;
1130
1131                 /* reserve space for Hardware header */
1132                 skb_reserve(skb, hh_len);
1133
1134                 /* create space for UDP/IP header */
1135                 skb_put(skb,fragheaderlen + transhdrlen);
1136
1137                 /* initialize network header pointer */
1138                 skb_reset_network_header(skb);
1139
1140                 /* initialize protocol header pointer */
1141                 skb->transport_header = skb->network_header + fragheaderlen;
1142
1143                 skb->ip_summed = CHECKSUM_PARTIAL;
1144                 skb->csum = 0;
1145         }
1146
1147         err = skb_append_datato_frags(sk,skb, getfrag, from,
1148                                       (length - transhdrlen));
1149         if (!err) {
1150                 struct frag_hdr fhdr;
1151
1152                 /* Specify the length of each IPv6 datagram fragment.
1153                  * It has to be a multiple of 8.
1154                  */
1155                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1156                                              sizeof(struct frag_hdr)) & ~7;
1157                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1158                 ipv6_select_ident(&fhdr, rt);
1159                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1160                 __skb_queue_tail(&sk->sk_write_queue, skb);
1161
1162                 return 0;
1163         }
1164         /* There is not enough support do UPD LSO,
1165          * so follow normal path
1166          */
1167         kfree_skb(skb);
1168
1169         return err;
1170 }
1171
1172 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1173                                                gfp_t gfp)
1174 {
1175         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1176 }
1177
1178 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1179                                                 gfp_t gfp)
1180 {
1181         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1182 }
1183
1184 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1185         int offset, int len, int odd, struct sk_buff *skb),
1186         void *from, int length, int transhdrlen,
1187         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1188         struct rt6_info *rt, unsigned int flags, int dontfrag)
1189 {
1190         struct inet_sock *inet = inet_sk(sk);
1191         struct ipv6_pinfo *np = inet6_sk(sk);
1192         struct inet_cork *cork;
1193         struct sk_buff *skb;
1194         unsigned int maxfraglen, fragheaderlen;
1195         int exthdrlen;
1196         int dst_exthdrlen;
1197         int hh_len;
1198         int mtu;
1199         int copy;
1200         int err;
1201         int offset = 0;
1202         int csummode = CHECKSUM_NONE;
1203         __u8 tx_flags = 0;
1204
1205         if (flags&MSG_PROBE)
1206                 return 0;
1207         cork = &inet->cork.base;
1208         if (skb_queue_empty(&sk->sk_write_queue)) {
1209                 /*
1210                  * setup for corking
1211                  */
1212                 if (opt) {
1213                         if (WARN_ON(np->cork.opt))
1214                                 return -EINVAL;
1215
1216                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1217                         if (unlikely(np->cork.opt == NULL))
1218                                 return -ENOBUFS;
1219
1220                         np->cork.opt->tot_len = opt->tot_len;
1221                         np->cork.opt->opt_flen = opt->opt_flen;
1222                         np->cork.opt->opt_nflen = opt->opt_nflen;
1223
1224                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1225                                                             sk->sk_allocation);
1226                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1227                                 return -ENOBUFS;
1228
1229                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1230                                                             sk->sk_allocation);
1231                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1232                                 return -ENOBUFS;
1233
1234                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1235                                                            sk->sk_allocation);
1236                         if (opt->hopopt && !np->cork.opt->hopopt)
1237                                 return -ENOBUFS;
1238
1239                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1240                                                             sk->sk_allocation);
1241                         if (opt->srcrt && !np->cork.opt->srcrt)
1242                                 return -ENOBUFS;
1243
1244                         /* need source address above miyazawa*/
1245                 }
1246                 dst_hold(&rt->dst);
1247                 cork->dst = &rt->dst;
1248                 inet->cork.fl.u.ip6 = *fl6;
1249                 np->cork.hop_limit = hlimit;
1250                 np->cork.tclass = tclass;
1251                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1252                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1253                 if (np->frag_size < mtu) {
1254                         if (np->frag_size)
1255                                 mtu = np->frag_size;
1256                 }
1257                 cork->fragsize = mtu;
1258                 if (dst_allfrag(rt->dst.path))
1259                         cork->flags |= IPCORK_ALLFRAG;
1260                 cork->length = 0;
1261                 sk->sk_sndmsg_page = NULL;
1262                 sk->sk_sndmsg_off = 0;
1263                 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1264                 length += exthdrlen;
1265                 transhdrlen += exthdrlen;
1266                 dst_exthdrlen = rt->dst.header_len;
1267         } else {
1268                 rt = (struct rt6_info *)cork->dst;
1269                 fl6 = &inet->cork.fl.u.ip6;
1270                 opt = np->cork.opt;
1271                 transhdrlen = 0;
1272                 exthdrlen = 0;
1273                 dst_exthdrlen = 0;
1274                 mtu = cork->fragsize;
1275         }
1276
1277         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1278
1279         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1280                         (opt ? opt->opt_nflen : 0);
1281         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1282
1283         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1284                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1285                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1286                         return -EMSGSIZE;
1287                 }
1288         }
1289
1290         /* For UDP, check if TX timestamp is enabled */
1291         if (sk->sk_type == SOCK_DGRAM) {
1292                 err = sock_tx_timestamp(sk, &tx_flags);
1293                 if (err)
1294                         goto error;
1295         }
1296
1297         /*
1298          * Let's try using as much space as possible.
1299          * Use MTU if total length of the message fits into the MTU.
1300          * Otherwise, we need to reserve fragment header and
1301          * fragment alignment (= 8-15 octects, in total).
1302          *
1303          * Note that we may need to "move" the data from the tail of
1304          * of the buffer to the new fragment when we split
1305          * the message.
1306          *
1307          * FIXME: It may be fragmented into multiple chunks
1308          *        at once if non-fragmentable extension headers
1309          *        are too large.
1310          * --yoshfuji
1311          */
1312
1313         cork->length += length;
1314         if (length > mtu) {
1315                 int proto = sk->sk_protocol;
1316                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1317                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1318                         return -EMSGSIZE;
1319                 }
1320
1321                 if (proto == IPPROTO_UDP &&
1322                     (rt->dst.dev->features & NETIF_F_UFO)) {
1323
1324                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1325                                                   hh_len, fragheaderlen,
1326                                                   transhdrlen, mtu, flags, rt);
1327                         if (err)
1328                                 goto error;
1329                         return 0;
1330                 }
1331         }
1332
1333         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1334                 goto alloc_new_skb;
1335
1336         while (length > 0) {
1337                 /* Check if the remaining data fits into current packet. */
1338                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1339                 if (copy < length)
1340                         copy = maxfraglen - skb->len;
1341
1342                 if (copy <= 0) {
1343                         char *data;
1344                         unsigned int datalen;
1345                         unsigned int fraglen;
1346                         unsigned int fraggap;
1347                         unsigned int alloclen;
1348                         struct sk_buff *skb_prev;
1349 alloc_new_skb:
1350                         skb_prev = skb;
1351
1352                         /* There's no room in the current skb */
1353                         if (skb_prev)
1354                                 fraggap = skb_prev->len - maxfraglen;
1355                         else
1356                                 fraggap = 0;
1357
1358                         /*
1359                          * If remaining data exceeds the mtu,
1360                          * we know we need more fragment(s).
1361                          */
1362                         datalen = length + fraggap;
1363                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1364                                 datalen = maxfraglen - fragheaderlen;
1365
1366                         fraglen = datalen + fragheaderlen;
1367                         if ((flags & MSG_MORE) &&
1368                             !(rt->dst.dev->features&NETIF_F_SG))
1369                                 alloclen = mtu;
1370                         else
1371                                 alloclen = datalen + fragheaderlen;
1372
1373                         alloclen += dst_exthdrlen;
1374
1375                         /*
1376                          * The last fragment gets additional space at tail.
1377                          * Note: we overallocate on fragments with MSG_MODE
1378                          * because we have no idea if we're the last one.
1379                          */
1380                         if (datalen == length + fraggap)
1381                                 alloclen += rt->dst.trailer_len;
1382
1383                         /*
1384                          * We just reserve space for fragment header.
1385                          * Note: this may be overallocation if the message
1386                          * (without MSG_MORE) fits into the MTU.
1387                          */
1388                         alloclen += sizeof(struct frag_hdr);
1389
1390                         if (transhdrlen) {
1391                                 skb = sock_alloc_send_skb(sk,
1392                                                 alloclen + hh_len,
1393                                                 (flags & MSG_DONTWAIT), &err);
1394                         } else {
1395                                 skb = NULL;
1396                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1397                                     2 * sk->sk_sndbuf)
1398                                         skb = sock_wmalloc(sk,
1399                                                            alloclen + hh_len, 1,
1400                                                            sk->sk_allocation);
1401                                 if (unlikely(skb == NULL))
1402                                         err = -ENOBUFS;
1403                                 else {
1404                                         /* Only the initial fragment
1405                                          * is time stamped.
1406                                          */
1407                                         tx_flags = 0;
1408                                 }
1409                         }
1410                         if (skb == NULL)
1411                                 goto error;
1412                         /*
1413                          *      Fill in the control structures
1414                          */
1415                         skb->ip_summed = csummode;
1416                         skb->csum = 0;
1417                         /* reserve for fragmentation */
1418                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1419
1420                         if (sk->sk_type == SOCK_DGRAM)
1421                                 skb_shinfo(skb)->tx_flags = tx_flags;
1422
1423                         /*
1424                          *      Find where to start putting bytes
1425                          */
1426                         data = skb_put(skb, fraglen + dst_exthdrlen);
1427                         skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
1428                         data += fragheaderlen + dst_exthdrlen;
1429                         skb->transport_header = (skb->network_header +
1430                                                  fragheaderlen);
1431                         if (fraggap) {
1432                                 skb->csum = skb_copy_and_csum_bits(
1433                                         skb_prev, maxfraglen,
1434                                         data + transhdrlen, fraggap, 0);
1435                                 skb_prev->csum = csum_sub(skb_prev->csum,
1436                                                           skb->csum);
1437                                 data += fraggap;
1438                                 pskb_trim_unique(skb_prev, maxfraglen);
1439                         }
1440                         copy = datalen - transhdrlen - fraggap;
1441
1442                         if (copy < 0) {
1443                                 err = -EINVAL;
1444                                 kfree_skb(skb);
1445                                 goto error;
1446                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1447                                 err = -EFAULT;
1448                                 kfree_skb(skb);
1449                                 goto error;
1450                         }
1451
1452                         offset += copy;
1453                         length -= datalen - fraggap;
1454                         transhdrlen = 0;
1455                         exthdrlen = 0;
1456                         dst_exthdrlen = 0;
1457                         csummode = CHECKSUM_NONE;
1458
1459                         /*
1460                          * Put the packet on the pending queue
1461                          */
1462                         __skb_queue_tail(&sk->sk_write_queue, skb);
1463                         continue;
1464                 }
1465
1466                 if (copy > length)
1467                         copy = length;
1468
1469                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1470                         unsigned int off;
1471
1472                         off = skb->len;
1473                         if (getfrag(from, skb_put(skb, copy),
1474                                                 offset, copy, off, skb) < 0) {
1475                                 __skb_trim(skb, off);
1476                                 err = -EFAULT;
1477                                 goto error;
1478                         }
1479                 } else {
1480                         int i = skb_shinfo(skb)->nr_frags;
1481                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1482                         struct page *page = sk->sk_sndmsg_page;
1483                         int off = sk->sk_sndmsg_off;
1484                         unsigned int left;
1485
1486                         if (page && (left = PAGE_SIZE - off) > 0) {
1487                                 if (copy >= left)
1488                                         copy = left;
1489                                 if (page != skb_frag_page(frag)) {
1490                                         if (i == MAX_SKB_FRAGS) {
1491                                                 err = -EMSGSIZE;
1492                                                 goto error;
1493                                         }
1494                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1495                                         skb_frag_ref(skb, i);
1496                                         frag = &skb_shinfo(skb)->frags[i];
1497                                 }
1498                         } else if(i < MAX_SKB_FRAGS) {
1499                                 if (copy > PAGE_SIZE)
1500                                         copy = PAGE_SIZE;
1501                                 page = alloc_pages(sk->sk_allocation, 0);
1502                                 if (page == NULL) {
1503                                         err = -ENOMEM;
1504                                         goto error;
1505                                 }
1506                                 sk->sk_sndmsg_page = page;
1507                                 sk->sk_sndmsg_off = 0;
1508
1509                                 skb_fill_page_desc(skb, i, page, 0, 0);
1510                                 frag = &skb_shinfo(skb)->frags[i];
1511                         } else {
1512                                 err = -EMSGSIZE;
1513                                 goto error;
1514                         }
1515                         if (getfrag(from, skb_frag_address(frag)+frag->size,
1516                                     offset, copy, skb->len, skb) < 0) {
1517                                 err = -EFAULT;
1518                                 goto error;
1519                         }
1520                         sk->sk_sndmsg_off += copy;
1521                         frag->size += copy;
1522                         skb->len += copy;
1523                         skb->data_len += copy;
1524                         skb->truesize += copy;
1525                         atomic_add(copy, &sk->sk_wmem_alloc);
1526                 }
1527                 offset += copy;
1528                 length -= copy;
1529         }
1530         return 0;
1531 error:
1532         cork->length -= length;
1533         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1534         return err;
1535 }
1536
1537 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1538 {
1539         if (np->cork.opt) {
1540                 kfree(np->cork.opt->dst0opt);
1541                 kfree(np->cork.opt->dst1opt);
1542                 kfree(np->cork.opt->hopopt);
1543                 kfree(np->cork.opt->srcrt);
1544                 kfree(np->cork.opt);
1545                 np->cork.opt = NULL;
1546         }
1547
1548         if (inet->cork.base.dst) {
1549                 dst_release(inet->cork.base.dst);
1550                 inet->cork.base.dst = NULL;
1551                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1552         }
1553         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1554 }
1555
1556 int ip6_push_pending_frames(struct sock *sk)
1557 {
1558         struct sk_buff *skb, *tmp_skb;
1559         struct sk_buff **tail_skb;
1560         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1561         struct inet_sock *inet = inet_sk(sk);
1562         struct ipv6_pinfo *np = inet6_sk(sk);
1563         struct net *net = sock_net(sk);
1564         struct ipv6hdr *hdr;
1565         struct ipv6_txoptions *opt = np->cork.opt;
1566         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1567         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1568         unsigned char proto = fl6->flowi6_proto;
1569         int err = 0;
1570
1571         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1572                 goto out;
1573         tail_skb = &(skb_shinfo(skb)->frag_list);
1574
1575         /* move skb->data to ip header from ext header */
1576         if (skb->data < skb_network_header(skb))
1577                 __skb_pull(skb, skb_network_offset(skb));
1578         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1579                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1580                 *tail_skb = tmp_skb;
1581                 tail_skb = &(tmp_skb->next);
1582                 skb->len += tmp_skb->len;
1583                 skb->data_len += tmp_skb->len;
1584                 skb->truesize += tmp_skb->truesize;
1585                 tmp_skb->destructor = NULL;
1586                 tmp_skb->sk = NULL;
1587         }
1588
1589         /* Allow local fragmentation. */
1590         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1591                 skb->local_df = 1;
1592
1593         ipv6_addr_copy(final_dst, &fl6->daddr);
1594         __skb_pull(skb, skb_network_header_len(skb));
1595         if (opt && opt->opt_flen)
1596                 ipv6_push_frag_opts(skb, opt, &proto);
1597         if (opt && opt->opt_nflen)
1598                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1599
1600         skb_push(skb, sizeof(struct ipv6hdr));
1601         skb_reset_network_header(skb);
1602         hdr = ipv6_hdr(skb);
1603
1604         *(__be32*)hdr = fl6->flowlabel |
1605                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1606
1607         hdr->hop_limit = np->cork.hop_limit;
1608         hdr->nexthdr = proto;
1609         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1610         ipv6_addr_copy(&hdr->daddr, final_dst);
1611
1612         skb->priority = sk->sk_priority;
1613         skb->mark = sk->sk_mark;
1614
1615         skb_dst_set(skb, dst_clone(&rt->dst));
1616         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1617         if (proto == IPPROTO_ICMPV6) {
1618                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1619
1620                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1621                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1622         }
1623
1624         err = ip6_local_out(skb);
1625         if (err) {
1626                 if (err > 0)
1627                         err = net_xmit_errno(err);
1628                 if (err)
1629                         goto error;
1630         }
1631
1632 out:
1633         ip6_cork_release(inet, np);
1634         return err;
1635 error:
1636         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1637         goto out;
1638 }
1639
1640 void ip6_flush_pending_frames(struct sock *sk)
1641 {
1642         struct sk_buff *skb;
1643
1644         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1645                 if (skb_dst(skb))
1646                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1647                                       IPSTATS_MIB_OUTDISCARDS);
1648                 kfree_skb(skb);
1649         }
1650
1651         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1652 }