net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/bpf-cgroup.h>
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58 #include <linux/mroute6.h>
  59 #include <net/l3mdev.h>
  60 #include <net/lwtunnel.h>
  61
  62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  63 {
  64         struct dst_entry *dst = skb_dst(skb);
  65         struct net_device *dev = dst->dev;
  66         struct neighbour *neigh;
  67         struct in6_addr *nexthop;
  68         int ret;
  69
  70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  72
  73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  74                     ((mroute6_socket(net, skb) &&
  75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  77                                          &ipv6_hdr(skb)->saddr))) {
  78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  79
  80                         /* Do not check for IFF_ALLMULTI; multicast routing
  81                            is not supported in any case.
  82                          */
  83                         if (newskb)
  84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  85                                         net, sk, newskb, NULL, newskb->dev,
  86                                         dev_loopback_xmit);
  87
  88                         if (ipv6_hdr(skb)->hop_limit == 0) {
  89                                 IP6_INC_STATS(net, idev,
  90                                               IPSTATS_MIB_OUTDISCARDS);
  91                                 kfree_skb(skb);
  92                                 return 0;
  93                         }
  94                 }
  95
  96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  97
  98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  99                     IPV6_ADDR_SCOPE_NODELOCAL &&
 100                     !(dev->flags & IFF_LOOPBACK)) {
 101                         kfree_skb(skb);
 102                         return 0;
 103                 }
 104         }
 105
 106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 107                 int res = lwtunnel_xmit(skb);
 108
 109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 110                         return res;
 111         }
 112
 113         rcu_read_lock_bh();
 114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 116         if (unlikely(!neigh))
 117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 118         if (!IS_ERR(neigh)) {
 119                 sock_confirm_neigh(skb, neigh);
 120                 ret = neigh_output(neigh, skb);
 121                 rcu_read_unlock_bh();
 122                 return ret;
 123         }
 124         rcu_read_unlock_bh();
 125
 126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 127         kfree_skb(skb);
 128         return -EINVAL;
 129 }
 130
 131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 132 {
 133         int ret;
 134
 135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 136         if (ret) {
 137                 kfree_skb(skb);
 138                 return ret;
 139         }
 140
 141         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 142             dst_allfrag(skb_dst(skb)) ||
 143             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 144                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 145         else
 146                 return ip6_finish_output2(net, sk, skb);
 147 }
 148
 149 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 150 {
 151         struct net_device *dev = skb_dst(skb)->dev;
 152         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 153
 154         skb->protocol = htons(ETH_P_IPV6);
 155         skb->dev = dev;
 156
 157         if (unlikely(idev->cnf.disable_ipv6)) {
 158                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 159                 kfree_skb(skb);
 160                 return 0;
 161         }
 162
 163         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 164                             net, sk, skb, NULL, dev,
 165                             ip6_finish_output,
 166                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 167 }
 168
 169 /*
 170  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 171  * Note : socket lock is not held for SYNACK packets, but might be modified
 172  * by calls to skb_set_owner_w() and ipv6_local_error(),
 173  * which are using proper atomic operations or spinlocks.
 174  */
 175 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 176              __u32 mark, struct ipv6_txoptions *opt, int tclass)
 177 {
 178         struct net *net = sock_net(sk);
 179         const struct ipv6_pinfo *np = inet6_sk(sk);
 180         struct in6_addr *first_hop = &fl6->daddr;
 181         struct dst_entry *dst = skb_dst(skb);
 182         struct ipv6hdr *hdr;
 183         u8  proto = fl6->flowi6_proto;
 184         int seg_len = skb->len;
 185         int hlimit = -1;
 186         u32 mtu;
 187
 188         if (opt) {
 189                 unsigned int head_room;
 190
 191                 /* First: exthdrs may take lots of space (~8K for now)
 192                    MAX_HEADER is not enough.
 193                  */
 194                 head_room = opt->opt_nflen + opt->opt_flen;
 195                 seg_len += head_room;
 196                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 197
 198                 if (skb_headroom(skb) < head_room) {
 199                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 200                         if (!skb2) {
 201                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 202                                               IPSTATS_MIB_OUTDISCARDS);
 203                                 kfree_skb(skb);
 204                                 return -ENOBUFS;
 205                         }
 206                         consume_skb(skb);
 207                         skb = skb2;
 208                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 209                          * it is safe to call in our context (socket lock not held)
 210                          */
 211                         skb_set_owner_w(skb, (struct sock *)sk);
 212                 }
 213                 if (opt->opt_flen)
 214                         ipv6_push_frag_opts(skb, opt, &proto);
 215                 if (opt->opt_nflen)
 216                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 217                                              &fl6->saddr);
 218         }
 219
 220         skb_push(skb, sizeof(struct ipv6hdr));
 221         skb_reset_network_header(skb);
 222         hdr = ipv6_hdr(skb);
 223
 224         /*
 225          *      Fill in the IPv6 header
 226          */
 227         if (np)
 228                 hlimit = np->hop_limit;
 229         if (hlimit < 0)
 230                 hlimit = ip6_dst_hoplimit(dst);
 231
 232         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 233                                                      np->autoflowlabel, fl6));
 234
 235         hdr->payload_len = htons(seg_len);
 236         hdr->nexthdr = proto;
 237         hdr->hop_limit = hlimit;
 238
 239         hdr->saddr = fl6->saddr;
 240         hdr->daddr = *first_hop;
 241
 242         skb->protocol = htons(ETH_P_IPV6);
 243         skb->priority = sk->sk_priority;
 244         skb->mark = mark;
 245
 246         mtu = dst_mtu(dst);
 247         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 248                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 249                               IPSTATS_MIB_OUT, skb->len);
 250
 251                 /* if egress device is enslaved to an L3 master device pass the
 252                  * skb to its handler for processing
 253                  */
 254                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
 255                 if (unlikely(!skb))
 256                         return 0;
 257
 258                 /* hooks should never assume socket lock is held.
 259                  * we promote our socket to non const
 260                  */
 261                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 262                                net, (struct sock *)sk, skb, NULL, dst->dev,
 263                                dst_output);
 264         }
 265
 266         skb->dev = dst->dev;
 267         /* ipv6_local_error() does not require socket lock,
 268          * we promote our socket to non const
 269          */
 270         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 271
 272         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 273         kfree_skb(skb);
 274         return -EMSGSIZE;
 275 }
 276 EXPORT_SYMBOL(ip6_xmit);
 277
 278 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 279 {
 280         struct ip6_ra_chain *ra;
 281         struct sock *last = NULL;
 282
 283         read_lock(&ip6_ra_lock);
 284         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 285                 struct sock *sk = ra->sk;
 286                 if (sk && ra->sel == sel &&
 287                     (!sk->sk_bound_dev_if ||
 288                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 289                         if (last) {
 290                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 291                                 if (skb2)
 292                                         rawv6_rcv(last, skb2);
 293                         }
 294                         last = sk;
 295                 }
 296         }
 297
 298         if (last) {
 299                 rawv6_rcv(last, skb);
 300                 read_unlock(&ip6_ra_lock);
 301                 return 1;
 302         }
 303         read_unlock(&ip6_ra_lock);
 304         return 0;
 305 }
 306
 307 static int ip6_forward_proxy_check(struct sk_buff *skb)
 308 {
 309         struct ipv6hdr *hdr = ipv6_hdr(skb);
 310         u8 nexthdr = hdr->nexthdr;
 311         __be16 frag_off;
 312         int offset;
 313
 314         if (ipv6_ext_hdr(nexthdr)) {
 315                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 316                 if (offset < 0)
 317                         return 0;
 318         } else
 319                 offset = sizeof(struct ipv6hdr);
 320
 321         if (nexthdr == IPPROTO_ICMPV6) {
 322                 struct icmp6hdr *icmp6;
 323
 324                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 325                                          offset + 1 - skb->data)))
 326                         return 0;
 327
 328                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 329
 330                 switch (icmp6->icmp6_type) {
 331                 case NDISC_ROUTER_SOLICITATION:
 332                 case NDISC_ROUTER_ADVERTISEMENT:
 333                 case NDISC_NEIGHBOUR_SOLICITATION:
 334                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 335                 case NDISC_REDIRECT:
 336                         /* For reaction involving unicast neighbor discovery
 337                          * message destined to the proxied address, pass it to
 338                          * input function.
 339                          */
 340                         return 1;
 341                 default:
 342                         break;
 343                 }
 344         }
 345
 346         /*
 347          * The proxying router can't forward traffic sent to a link-local
 348          * address, so signal the sender and discard the packet. This
 349          * behavior is clarified by the MIPv6 specification.
 350          */
 351         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 352                 dst_link_failure(skb);
 353                 return -1;
 354         }
 355
 356         return 0;
 357 }
 358
 359 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 360                                      struct sk_buff *skb)
 361 {
 362         return dst_output(net, sk, skb);
 363 }
 364
 365 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 366 {
 367         unsigned int mtu;
 368         struct inet6_dev *idev;
 369
 370         if (dst_metric_locked(dst, RTAX_MTU)) {
 371                 mtu = dst_metric_raw(dst, RTAX_MTU);
 372                 if (mtu)
 373                         return mtu;
 374         }
 375
 376         mtu = IPV6_MIN_MTU;
 377         rcu_read_lock();
 378         idev = __in6_dev_get(dst->dev);
 379         if (idev)
 380                 mtu = idev->cnf.mtu6;
 381         rcu_read_unlock();
 382
 383         return mtu;
 384 }
 385
 386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 387 {
 388         if (skb->len <= mtu)
 389                 return false;
 390
 391         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 392         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 393                 return true;
 394
 395         if (skb->ignore_df)
 396                 return false;
 397
 398         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
 399                 return false;
 400
 401         return true;
 402 }
 403
 404 int ip6_forward(struct sk_buff *skb)
 405 {
 406         struct dst_entry *dst = skb_dst(skb);
 407         struct ipv6hdr *hdr = ipv6_hdr(skb);
 408         struct inet6_skb_parm *opt = IP6CB(skb);
 409         struct net *net = dev_net(dst->dev);
 410         u32 mtu;
 411
 412         if (net->ipv6.devconf_all->forwarding == 0)
 413                 goto error;
 414
 415         if (skb->pkt_type != PACKET_HOST)
 416                 goto drop;
 417
 418         if (unlikely(skb->sk))
 419                 goto drop;
 420
 421         if (skb_warn_if_lro(skb))
 422                 goto drop;
 423
 424         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 425                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 426                                 IPSTATS_MIB_INDISCARDS);
 427                 goto drop;
 428         }
 429
 430         skb_forward_csum(skb);
 431
 432         /*
 433          *      We DO NOT make any processing on
 434          *      RA packets, pushing them to user level AS IS
 435          *      without ane WARRANTY that application will be able
 436          *      to interpret them. The reason is that we
 437          *      cannot make anything clever here.
 438          *
 439          *      We are not end-node, so that if packet contains
 440          *      AH/ESP, we cannot make anything.
 441          *      Defragmentation also would be mistake, RA packets
 442          *      cannot be fragmented, because there is no warranty
 443          *      that different fragments will go along one path. --ANK
 444          */
 445         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 446                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 447                         return 0;
 448         }
 449
 450         /*
 451          *      check and decrement ttl
 452          */
 453         if (hdr->hop_limit <= 1) {
 454                 /* Force OUTPUT device used as source address */
 455                 skb->dev = dst->dev;
 456                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 457                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 458                                 IPSTATS_MIB_INHDRERRORS);
 459
 460                 kfree_skb(skb);
 461                 return -ETIMEDOUT;
 462         }
 463
 464         /* XXX: idev->cnf.proxy_ndp? */
 465         if (net->ipv6.devconf_all->proxy_ndp &&
 466             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 467                 int proxied = ip6_forward_proxy_check(skb);
 468                 if (proxied > 0)
 469                         return ip6_input(skb);
 470                 else if (proxied < 0) {
 471                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
 472                                         IPSTATS_MIB_INDISCARDS);
 473                         goto drop;
 474                 }
 475         }
 476
 477         if (!xfrm6_route_forward(skb)) {
 478                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 479                                 IPSTATS_MIB_INDISCARDS);
 480                 goto drop;
 481         }
 482         dst = skb_dst(skb);
 483
 484         /* IPv6 specs say nothing about it, but it is clear that we cannot
 485            send redirects to source routed frames.
 486            We don't send redirects to frames decapsulated from IPsec.
 487          */
 488         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 489                 struct in6_addr *target = NULL;
 490                 struct inet_peer *peer;
 491                 struct rt6_info *rt;
 492
 493                 /*
 494                  *      incoming and outgoing devices are the same
 495                  *      send a redirect.
 496                  */
 497
 498                 rt = (struct rt6_info *) dst;
 499                 if (rt->rt6i_flags & RTF_GATEWAY)
 500                         target = &rt->rt6i_gateway;
 501                 else
 502                         target = &hdr->daddr;
 503
 504                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 505
 506                 /* Limit redirects both by destination (here)
 507                    and by source (inside ndisc_send_redirect)
 508                  */
 509                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 510                         ndisc_send_redirect(skb, target);
 511                 if (peer)
 512                         inet_putpeer(peer);
 513         } else {
 514                 int addrtype = ipv6_addr_type(&hdr->saddr);
 515
 516                 /* This check is security critical. */
 517                 if (addrtype == IPV6_ADDR_ANY ||
 518                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 519                         goto error;
 520                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 521                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 522                                     ICMPV6_NOT_NEIGHBOUR, 0);
 523                         goto error;
 524                 }
 525         }
 526
 527         mtu = ip6_dst_mtu_forward(dst);
 528         if (mtu < IPV6_MIN_MTU)
 529                 mtu = IPV6_MIN_MTU;
 530
 531         if (ip6_pkt_too_big(skb, mtu)) {
 532                 /* Again, force OUTPUT device used as source address */
 533                 skb->dev = dst->dev;
 534                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 535                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 536                                 IPSTATS_MIB_INTOOBIGERRORS);
 537                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 538                                 IPSTATS_MIB_FRAGFAILS);
 539                 kfree_skb(skb);
 540                 return -EMSGSIZE;
 541         }
 542
 543         if (skb_cow(skb, dst->dev->hard_header_len)) {
 544                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 545                                 IPSTATS_MIB_OUTDISCARDS);
 546                 goto drop;
 547         }
 548
 549         hdr = ipv6_hdr(skb);
 550
 551         /* Mangling hops number delayed to point after skb COW */
 552
 553         hdr->hop_limit--;
 554
 555         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 556         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 557         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 558                        net, NULL, skb, skb->dev, dst->dev,
 559                        ip6_forward_finish);
 560
 561 error:
 562         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 563 drop:
 564         kfree_skb(skb);
 565         return -EINVAL;
 566 }
 567
 568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 569 {
 570         to->pkt_type = from->pkt_type;
 571         to->priority = from->priority;
 572         to->protocol = from->protocol;
 573         skb_dst_drop(to);
 574         skb_dst_set(to, dst_clone(skb_dst(from)));
 575         to->dev = from->dev;
 576         to->mark = from->mark;
 577
 578 #ifdef CONFIG_NET_SCHED
 579         to->tc_index = from->tc_index;
 580 #endif
 581         nf_copy(to, from);
 582         skb_copy_secmark(to, from);
 583 }
 584
 585 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 586                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 587 {
 588         struct sk_buff *frag;
 589         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 590         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 591                                 inet6_sk(skb->sk) : NULL;
 592         struct ipv6hdr *tmp_hdr;
 593         struct frag_hdr *fh;
 594         unsigned int mtu, hlen, left, len;
 595         int hroom, troom;
 596         __be32 frag_id;
 597         int ptr, offset = 0, err = 0;
 598         u8 *prevhdr, nexthdr = 0;
 599
 600         err = ip6_find_1stfragopt(skb, &prevhdr);
 601         if (err < 0)
 602                 goto fail;
 603         hlen = err;
 604         nexthdr = *prevhdr;
 605
 606         mtu = ip6_skb_dst_mtu(skb);
 607
 608         /* We must not fragment if the socket is set to force MTU discovery
 609          * or if the skb it not generated by a local socket.
 610          */
 611         if (unlikely(!skb->ignore_df && skb->len > mtu))
 612                 goto fail_toobig;
 613
 614         if (IP6CB(skb)->frag_max_size) {
 615                 if (IP6CB(skb)->frag_max_size > mtu)
 616                         goto fail_toobig;
 617
 618                 /* don't send fragments larger than what we received */
 619                 mtu = IP6CB(skb)->frag_max_size;
 620                 if (mtu < IPV6_MIN_MTU)
 621                         mtu = IPV6_MIN_MTU;
 622         }
 623
 624         if (np && np->frag_size < mtu) {
 625                 if (np->frag_size)
 626                         mtu = np->frag_size;
 627         }
 628         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 629                 goto fail_toobig;
 630         mtu -= hlen + sizeof(struct frag_hdr);
 631
 632         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 633                                     &ipv6_hdr(skb)->saddr);
 634
 635         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 636             (err = skb_checksum_help(skb)))
 637                 goto fail;
 638
 639         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 640         if (skb_has_frag_list(skb)) {
 641                 unsigned int first_len = skb_pagelen(skb);
 642                 struct sk_buff *frag2;
 643
 644                 if (first_len - hlen > mtu ||
 645                     ((first_len - hlen) & 7) ||
 646                     skb_cloned(skb) ||
 647                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 648                         goto slow_path;
 649
 650                 skb_walk_frags(skb, frag) {
 651                         /* Correct geometry. */
 652                         if (frag->len > mtu ||
 653                             ((frag->len & 7) && frag->next) ||
 654                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 655                                 goto slow_path_clean;
 656
 657                         /* Partially cloned skb? */
 658                         if (skb_shared(frag))
 659                                 goto slow_path_clean;
 660
 661                         BUG_ON(frag->sk);
 662                         if (skb->sk) {
 663                                 frag->sk = skb->sk;
 664                                 frag->destructor = sock_wfree;
 665                         }
 666                         skb->truesize -= frag->truesize;
 667                 }
 668
 669                 err = 0;
 670                 offset = 0;
 671                 /* BUILD HEADER */
 672
 673                 *prevhdr = NEXTHDR_FRAGMENT;
 674                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 675                 if (!tmp_hdr) {
 676                         err = -ENOMEM;
 677                         goto fail;
 678                 }
 679                 frag = skb_shinfo(skb)->frag_list;
 680                 skb_frag_list_init(skb);
 681
 682                 __skb_pull(skb, hlen);
 683                 fh = __skb_push(skb, sizeof(struct frag_hdr));
 684                 __skb_push(skb, hlen);
 685                 skb_reset_network_header(skb);
 686                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 687
 688                 fh->nexthdr = nexthdr;
 689                 fh->reserved = 0;
 690                 fh->frag_off = htons(IP6_MF);
 691                 fh->identification = frag_id;
 692
 693                 first_len = skb_pagelen(skb);
 694                 skb->data_len = first_len - skb_headlen(skb);
 695                 skb->len = first_len;
 696                 ipv6_hdr(skb)->payload_len = htons(first_len -
 697                                                    sizeof(struct ipv6hdr));
 698
 699                 for (;;) {
 700                         /* Prepare header of the next frame,
 701                          * before previous one went down. */
 702                         if (frag) {
 703                                 frag->ip_summed = CHECKSUM_NONE;
 704                                 skb_reset_transport_header(frag);
 705                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
 706                                 __skb_push(frag, hlen);
 707                                 skb_reset_network_header(frag);
 708                                 memcpy(skb_network_header(frag), tmp_hdr,
 709                                        hlen);
 710                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 711                                 fh->nexthdr = nexthdr;
 712                                 fh->reserved = 0;
 713                                 fh->frag_off = htons(offset);
 714                                 if (frag->next)
 715                                         fh->frag_off |= htons(IP6_MF);
 716                                 fh->identification = frag_id;
 717                                 ipv6_hdr(frag)->payload_len =
 718                                                 htons(frag->len -
 719                                                       sizeof(struct ipv6hdr));
 720                                 ip6_copy_metadata(frag, skb);
 721                         }
 722
 723                         err = output(net, sk, skb);
 724                         if (!err)
 725                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 726                                               IPSTATS_MIB_FRAGCREATES);
 727
 728                         if (err || !frag)
 729                                 break;
 730
 731                         skb = frag;
 732                         frag = skb->next;
 733                         skb->next = NULL;
 734                 }
 735
 736                 kfree(tmp_hdr);
 737
 738                 if (err == 0) {
 739                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 740                                       IPSTATS_MIB_FRAGOKS);
 741                         return 0;
 742                 }
 743
 744                 kfree_skb_list(frag);
 745
 746                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 747                               IPSTATS_MIB_FRAGFAILS);
 748                 return err;
 749
 750 slow_path_clean:
 751                 skb_walk_frags(skb, frag2) {
 752                         if (frag2 == frag)
 753                                 break;
 754                         frag2->sk = NULL;
 755                         frag2->destructor = NULL;
 756                         skb->truesize += frag2->truesize;
 757                 }
 758         }
 759
 760 slow_path:
 761         left = skb->len - hlen;         /* Space per frame */
 762         ptr = hlen;                     /* Where to start from */
 763
 764         /*
 765          *      Fragment the datagram.
 766          */
 767
 768         troom = rt->dst.dev->needed_tailroom;
 769
 770         /*
 771          *      Keep copying data until we run out.
 772          */
 773         while (left > 0)        {
 774                 u8 *fragnexthdr_offset;
 775
 776                 len = left;
 777                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 778                 if (len > mtu)
 779                         len = mtu;
 780                 /* IF: we are not sending up to and including the packet end
 781                    then align the next start on an eight byte boundary */
 782                 if (len < left) {
 783                         len &= ~7;
 784                 }
 785
 786                 /* Allocate buffer */
 787                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 788                                  hroom + troom, GFP_ATOMIC);
 789                 if (!frag) {
 790                         err = -ENOMEM;
 791                         goto fail;
 792                 }
 793
 794                 /*
 795                  *      Set up data on packet
 796                  */
 797
 798                 ip6_copy_metadata(frag, skb);
 799                 skb_reserve(frag, hroom);
 800                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 801                 skb_reset_network_header(frag);
 802                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 803                 frag->transport_header = (frag->network_header + hlen +
 804                                           sizeof(struct frag_hdr));
 805
 806                 /*
 807                  *      Charge the memory for the fragment to any owner
 808                  *      it might possess
 809                  */
 810                 if (skb->sk)
 811                         skb_set_owner_w(frag, skb->sk);
 812
 813                 /*
 814                  *      Copy the packet header into the new buffer.
 815                  */
 816                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 817
 818                 fragnexthdr_offset = skb_network_header(frag);
 819                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
 820                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
 821
 822                 /*
 823                  *      Build fragment header.
 824                  */
 825                 fh->nexthdr = nexthdr;
 826                 fh->reserved = 0;
 827                 fh->identification = frag_id;
 828
 829                 /*
 830                  *      Copy a block of the IP datagram.
 831                  */
 832                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 833                                      len));
 834                 left -= len;
 835
 836                 fh->frag_off = htons(offset);
 837                 if (left > 0)
 838                         fh->frag_off |= htons(IP6_MF);
 839                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 840                                                     sizeof(struct ipv6hdr));
 841
 842                 ptr += len;
 843                 offset += len;
 844
 845                 /*
 846                  *      Put this fragment into the sending queue.
 847                  */
 848                 err = output(net, sk, frag);
 849                 if (err)
 850                         goto fail;
 851
 852                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 853                               IPSTATS_MIB_FRAGCREATES);
 854         }
 855         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 856                       IPSTATS_MIB_FRAGOKS);
 857         consume_skb(skb);
 858         return err;
 859
 860 fail_toobig:
 861         if (skb->sk && dst_allfrag(skb_dst(skb)))
 862                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 863
 864         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 865         err = -EMSGSIZE;
 866
 867 fail:
 868         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 869                       IPSTATS_MIB_FRAGFAILS);
 870         kfree_skb(skb);
 871         return err;
 872 }
 873
 874 static inline int ip6_rt_check(const struct rt6key *rt_key,
 875                                const struct in6_addr *fl_addr,
 876                                const struct in6_addr *addr_cache)
 877 {
 878         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 879                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 880 }
 881
 882 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 883                                           struct dst_entry *dst,
 884                                           const struct flowi6 *fl6)
 885 {
 886         struct ipv6_pinfo *np = inet6_sk(sk);
 887         struct rt6_info *rt;
 888
 889         if (!dst)
 890                 goto out;
 891
 892         if (dst->ops->family != AF_INET6) {
 893                 dst_release(dst);
 894                 return NULL;
 895         }
 896
 897         rt = (struct rt6_info *)dst;
 898         /* Yes, checking route validity in not connected
 899          * case is not very simple. Take into account,
 900          * that we do not support routing by source, TOS,
 901          * and MSG_DONTROUTE            --ANK (980726)
 902          *
 903          * 1. ip6_rt_check(): If route was host route,
 904          *    check that cached destination is current.
 905          *    If it is network route, we still may
 906          *    check its validity using saved pointer
 907          *    to the last used address: daddr_cache.
 908          *    We do not want to save whole address now,
 909          *    (because main consumer of this service
 910          *    is tcp, which has not this problem),
 911          *    so that the last trick works only on connected
 912          *    sockets.
 913          * 2. oif also should be the same.
 914          */
 915         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 916 #ifdef CONFIG_IPV6_SUBTREES
 917             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 918 #endif
 919            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 920               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 921                 dst_release(dst);
 922                 dst = NULL;
 923         }
 924
 925 out:
 926         return dst;
 927 }
 928
 929 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 930                                struct dst_entry **dst, struct flowi6 *fl6)
 931 {
 932 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 933         struct neighbour *n;
 934         struct rt6_info *rt;
 935 #endif
 936         int err;
 937         int flags = 0;
 938
 939         /* The correct way to handle this would be to do
 940          * ip6_route_get_saddr, and then ip6_route_output; however,
 941          * the route-specific preferred source forces the
 942          * ip6_route_output call _before_ ip6_route_get_saddr.
 943          *
 944          * In source specific routing (no src=any default route),
 945          * ip6_route_output will fail given src=any saddr, though, so
 946          * that's why we try it again later.
 947          */
 948         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 949                 struct rt6_info *rt;
 950                 bool had_dst = *dst != NULL;
 951
 952                 if (!had_dst)
 953                         *dst = ip6_route_output(net, sk, fl6);
 954                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 955                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 956                                           sk ? inet6_sk(sk)->srcprefs : 0,
 957                                           &fl6->saddr);
 958                 if (err)
 959                         goto out_err_release;
 960
 961                 /* If we had an erroneous initial result, pretend it
 962                  * never existed and let the SA-enabled version take
 963                  * over.
 964                  */
 965                 if (!had_dst && (*dst)->error) {
 966                         dst_release(*dst);
 967                         *dst = NULL;
 968                 }
 969
 970                 if (fl6->flowi6_oif)
 971                         flags |= RT6_LOOKUP_F_IFACE;
 972         }
 973
 974         if (!*dst)
 975                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
 976
 977         err = (*dst)->error;
 978         if (err)
 979                 goto out_err_release;
 980
 981 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 982         /*
 983          * Here if the dst entry we've looked up
 984          * has a neighbour entry that is in the INCOMPLETE
 985          * state and the src address from the flow is
 986          * marked as OPTIMISTIC, we release the found
 987          * dst entry and replace it instead with the
 988          * dst entry of the nexthop router
 989          */
 990         rt = (struct rt6_info *) *dst;
 991         rcu_read_lock_bh();
 992         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 993                                       rt6_nexthop(rt, &fl6->daddr));
 994         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 995         rcu_read_unlock_bh();
 996
 997         if (err) {
 998                 struct inet6_ifaddr *ifp;
 999                 struct flowi6 fl_gw6;
1000                 int redirect;
1001
1002                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1003                                       (*dst)->dev, 1);
1004
1005                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1006                 if (ifp)
1007                         in6_ifa_put(ifp);
1008
1009                 if (redirect) {
1010                         /*
1011                          * We need to get the dst entry for the
1012                          * default router instead
1013                          */
1014                         dst_release(*dst);
1015                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1016                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1017                         *dst = ip6_route_output(net, sk, &fl_gw6);
1018                         err = (*dst)->error;
1019                         if (err)
1020                                 goto out_err_release;
1021                 }
1022         }
1023 #endif
1024         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1025             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1026                 err = -EAFNOSUPPORT;
1027                 goto out_err_release;
1028         }
1029
1030         return 0;
1031
1032 out_err_release:
1033         dst_release(*dst);
1034         *dst = NULL;
1035
1036         if (err == -ENETUNREACH)
1037                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1038         return err;
1039 }
1040
1041 /**
1042  *      ip6_dst_lookup - perform route lookup on flow
1043  *      @sk: socket which provides route info
1044  *      @dst: pointer to dst_entry * for result
1045  *      @fl6: flow to lookup
1046  *
1047  *      This function performs a route lookup on the given flow.
1048  *
1049  *      It returns zero on success, or a standard errno code on error.
1050  */
1051 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1052                    struct flowi6 *fl6)
1053 {
1054         *dst = NULL;
1055         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1056 }
1057 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1058
1059 /**
1060  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1061  *      @sk: socket which provides route info
1062  *      @fl6: flow to lookup
1063  *      @final_dst: final destination address for ipsec lookup
1064  *
1065  *      This function performs a route lookup on the given flow.
1066  *
1067  *      It returns a valid dst pointer on success, or a pointer encoded
1068  *      error code.
1069  */
1070 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1071                                       const struct in6_addr *final_dst)
1072 {
1073         struct dst_entry *dst = NULL;
1074         int err;
1075
1076         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1077         if (err)
1078                 return ERR_PTR(err);
1079         if (final_dst)
1080                 fl6->daddr = *final_dst;
1081
1082         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1083 }
1084 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1085
1086 /**
1087  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1088  *      @sk: socket which provides the dst cache and route info
1089  *      @fl6: flow to lookup
1090  *      @final_dst: final destination address for ipsec lookup
1091  *
1092  *      This function performs a route lookup on the given flow with the
1093  *      possibility of using the cached route in the socket if it is valid.
1094  *      It will take the socket dst lock when operating on the dst cache.
1095  *      As a result, this function can only be used in process context.
1096  *
1097  *      It returns a valid dst pointer on success, or a pointer encoded
1098  *      error code.
1099  */
1100 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1101                                          const struct in6_addr *final_dst)
1102 {
1103         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1104
1105         dst = ip6_sk_dst_check(sk, dst, fl6);
1106         if (!dst)
1107                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1108
1109         return dst;
1110 }
1111 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1112
1113 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1114                                                gfp_t gfp)
1115 {
1116         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1117 }
1118
1119 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1120                                                 gfp_t gfp)
1121 {
1122         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1123 }
1124
1125 static void ip6_append_data_mtu(unsigned int *mtu,
1126                                 int *maxfraglen,
1127                                 unsigned int fragheaderlen,
1128                                 struct sk_buff *skb,
1129                                 struct rt6_info *rt,
1130                                 unsigned int orig_mtu)
1131 {
1132         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1133                 if (!skb) {
1134                         /* first fragment, reserve header_len */
1135                         *mtu = orig_mtu - rt->dst.header_len;
1136
1137                 } else {
1138                         /*
1139                          * this fragment is not first, the headers
1140                          * space is regarded as data space.
1141                          */
1142                         *mtu = orig_mtu;
1143                 }
1144                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1145                               + fragheaderlen - sizeof(struct frag_hdr);
1146         }
1147 }
1148
1149 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1150                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1151                           struct rt6_info *rt, struct flowi6 *fl6)
1152 {
1153         struct ipv6_pinfo *np = inet6_sk(sk);
1154         unsigned int mtu;
1155         struct ipv6_txoptions *opt = ipc6->opt;
1156
1157         /*
1158          * setup for corking
1159          */
1160         if (opt) {
1161                 if (WARN_ON(v6_cork->opt))
1162                         return -EINVAL;
1163
1164                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1165                 if (unlikely(!v6_cork->opt))
1166                         return -ENOBUFS;
1167
1168                 v6_cork->opt->tot_len = opt->tot_len;
1169                 v6_cork->opt->opt_flen = opt->opt_flen;
1170                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1171
1172                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1173                                                     sk->sk_allocation);
1174                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1175                         return -ENOBUFS;
1176
1177                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1178                                                     sk->sk_allocation);
1179                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1180                         return -ENOBUFS;
1181
1182                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1183                                                    sk->sk_allocation);
1184                 if (opt->hopopt && !v6_cork->opt->hopopt)
1185                         return -ENOBUFS;
1186
1187                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1188                                                     sk->sk_allocation);
1189                 if (opt->srcrt && !v6_cork->opt->srcrt)
1190                         return -ENOBUFS;
1191
1192                 /* need source address above miyazawa*/
1193         }
1194         dst_hold(&rt->dst);
1195         cork->base.dst = &rt->dst;
1196         cork->fl.u.ip6 = *fl6;
1197         v6_cork->hop_limit = ipc6->hlimit;
1198         v6_cork->tclass = ipc6->tclass;
1199         if (rt->dst.flags & DST_XFRM_TUNNEL)
1200                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1201                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1202         else
1203                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1204                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1205         if (np->frag_size < mtu) {
1206                 if (np->frag_size)
1207                         mtu = np->frag_size;
1208         }
1209         cork->base.fragsize = mtu;
1210         if (dst_allfrag(rt->dst.path))
1211                 cork->base.flags |= IPCORK_ALLFRAG;
1212         cork->base.length = 0;
1213
1214         return 0;
1215 }
1216
/*
 * __ip6_append_data - add @length more bytes to the skbs pending on @queue
 *
 * The bytes are fetched through @getfrag from @from.  They are appended
 * to the tail skb of @queue while it has room; when it has none, a new
 * skb is allocated and queued, and the bytes of the previous skb that lie
 * beyond maxfraglen are moved into the new one.
 *
 * @transhdrlen bytes at the start of the payload area of the first new
 * skb are left for the caller (getfrag writes behind them).  A non-zero
 * @transhdrlen also selects sock_alloc_send_skb() for the allocation;
 * otherwise sock_wmalloc() is used, and only while sk_wmem_alloc does not
 * exceed 2 * sk_sndbuf.  It is reset to 0 once the first skb was built.
 *
 * Returns 0 on success.  The size checks at the top return -EMSGSIZE
 * (after ipv6_local_error(), preceded by ipv6_local_rxpmtu() in the
 * dontfrag case) before any state is changed.  Later failures return a
 * negative errno through the error label, which takes the unconsumed
 * part of @length off cork->length again and increments
 * IPSTATS_MIB_OUTDISCARDS.
 */
1217 static int __ip6_append_data(struct sock *sk,
1218                              struct flowi6 *fl6,
1219                              struct sk_buff_head *queue,
1220                              struct inet_cork *cork,
1221                              struct inet6_cork *v6_cork,
1222                              struct page_frag *pfrag,
1223                              int getfrag(void *from, char *to, int offset,
1224                                          int len, int odd, struct sk_buff *skb),
1225                              void *from, int length, int transhdrlen,
1226                              unsigned int flags, struct ipcm6_cookie *ipc6,
1227                              const struct sockcm_cookie *sockc)
1228 {
1229         struct sk_buff *skb, *skb_prev = NULL;
1230         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1231         int exthdrlen = 0;
1232         int dst_exthdrlen = 0;
1233         int hh_len;
1234         int copy;
1235         int err;
1236         int offset = 0;
1237         __u8 tx_flags = 0;
1238         u32 tskey = 0;
1239         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1240         struct ipv6_txoptions *opt = v6_cork->opt;
1241         int csummode = CHECKSUM_NONE;
1242         unsigned int maxnonfragsize, headersize;
1243 
             /*
              * Extension header and dst header space is only taken into
              * account when the queue is empty, i.e. for the first skb
              * that gets built below.
              */
1244         skb = skb_peek_tail(queue);
1245         if (!skb) {
1246                 exthdrlen = opt ? opt->opt_flen : 0;
1247                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1248         }
1249 
1250         mtu = cork->fragsize;
1251         orig_mtu = mtu;
1252 
1253         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1254 
             /*
              * fragheaderlen: header bytes placed in front of the data of
              * every skb.  maxfraglen: fragheaderlen plus the 8-byte
              * aligned amount of data that fits into mtu, less the size
              * of a fragment header.
              */
1255         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1256                         (opt ? opt->opt_nflen : 0);
1257         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1258                      sizeof(struct frag_hdr);
1259 
1260         headersize = sizeof(struct ipv6hdr) +
1261                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1262                      (dst_allfrag(&rt->dst) ?
1263                       sizeof(struct frag_hdr) : 0) +
1264                      rt->rt6i_nfheader_len;
1265 
             /*
              * ipc6->dontfrag on a UDP or RAW socket: do not fragment an
              * oversized message, report the size via ipv6_local_rxpmtu()
              * and fail with -EMSGSIZE instead.
              */
1266         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1267             (sk->sk_protocol == IPPROTO_UDP ||
1268              sk->sk_protocol == IPPROTO_RAW)) {
1269                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1270                                 sizeof(struct ipv6hdr));
1271                 goto emsgsize;
1272         }
1273 
1274         if (ip6_sk_ignore_df(sk))
1275                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1276         else
1277                 maxnonfragsize = mtu;
1278 
1279         if (cork->length + length > maxnonfragsize - headersize) {
1280 emsgsize:
1281                 ipv6_local_error(sk, EMSGSIZE, fl6,
1282                                  mtu - headersize +
1283                                  sizeof(struct ipv6hdr));
1284                 return -EMSGSIZE;
1285         }
1286 
1287         /* CHECKSUM_PARTIAL only with no extension headers and when
1288          * we are not going to fragment
1289          */
1290         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1291             headersize == sizeof(struct ipv6hdr) &&
1292             length <= mtu - headersize &&
1293             !(flags & MSG_MORE) &&
1294             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1295                 csummode = CHECKSUM_PARTIAL;
1296 
1297         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1298                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1299                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1300                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1301                         tskey = sk->sk_tskey++;
1302         }
1303 
1304         /*
1305          * Let's try using as much space as possible.
1306          * Use MTU if total length of the message fits into the MTU.
1307          * Otherwise, we need to reserve fragment header and
1308          * fragment alignment (= 8-15 octets, in total).
1309          *
1310          * Note that we may need to "move" the data from the tail of
1311          * the buffer to the new fragment when we split
1312          * the message.
1313          *
1314          * FIXME: It may be fragmented into multiple chunks
1315          *        at once if non-fragmentable extension headers
1316          *        are too large.
1317          * --yoshfuji
1318          */
1319 
             /*
              * The new bytes are accounted up front; the error label
              * subtracts whatever part of them was not consumed.
              */
1320         cork->length += length;
1321         if (!skb)
1322                 goto alloc_new_skb;
1323 
1324         while (length > 0) {
1325                 /* Check if the remaining data fits into current packet. */
1326                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1327                 if (copy < length)
1328                         copy = maxfraglen - skb->len;
1329 
1330                 if (copy <= 0) {
1331                         char *data;
1332                         unsigned int datalen;
1333                         unsigned int fraglen;
1334                         unsigned int fraggap;
1335                         unsigned int alloclen;
1336 alloc_new_skb:
1337                         /* There's no room in the current skb */
                             /*
                              * fraggap: bytes of the current tail skb that
                              * lie beyond maxfraglen; they are moved into
                              * the new skb further down.
                              */
1338                         if (skb)
1339                                 fraggap = skb->len - maxfraglen;
1340                         else
1341                                 fraggap = 0;
1342                         /* update mtu and maxfraglen if necessary */
1343                         if (!skb || !skb_prev)
1344                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1345                                                     fragheaderlen, skb, rt,
1346                                                     orig_mtu);
1347 
1348                         skb_prev = skb;
1349 
1350                         /*
1351                          * If remaining data exceeds the mtu,
1352                          * we know we need more fragment(s).
1353                          */
1354                         datalen = length + fraggap;
1355 
1356                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1357                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1358                         if ((flags & MSG_MORE) &&
1359                             !(rt->dst.dev->features&NETIF_F_SG))
1360                                 alloclen = mtu;
1361                         else
1362                                 alloclen = datalen + fragheaderlen;
1363 
1364                         alloclen += dst_exthdrlen;
1365 
1366                         if (datalen != length + fraggap) {
1367                                 /*
1368                                  * this is not the last fragment, the trailer
1369                                  * space is regarded as data space.
1370                                  */
1371                                 datalen += rt->dst.trailer_len;
1372                         }
1373 
1374                         alloclen += rt->dst.trailer_len;
1375                         fraglen = datalen + fragheaderlen;
1376 
1377                         /*
1378                          * We just reserve space for fragment header.
1379                          * Note: this may be overallocation if the message
1380                          * (without MSG_MORE) fits into the MTU.
1381                          */
1382                         alloclen += sizeof(struct frag_hdr);
1383 
                             /*
                              * copy: bytes getfrag has to deliver for this
                              * skb, i.e. its data minus the caller's
                              * transhdrlen bytes and the moved fraggap.
                              */
1384                         copy = datalen - transhdrlen - fraggap;
1385                         if (copy < 0) {
1386                                 err = -EINVAL;
1387                                 goto error;
1388                         }
1389                         if (transhdrlen) {
1390                                 skb = sock_alloc_send_skb(sk,
1391                                                 alloclen + hh_len,
1392                                                 (flags & MSG_DONTWAIT), &err);
1393                         } else {
1394                                 skb = NULL;
1395                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1396                                     2 * sk->sk_sndbuf)
1397                                         skb = sock_wmalloc(sk,
1398                                                            alloclen + hh_len, 1,
1399                                                            sk->sk_allocation);
1400                                 if (unlikely(!skb))
1401                                         err = -ENOBUFS;
1402                         }
1403                         if (!skb)
1404                                 goto error;
1405                         /*
1406                          *      Fill in the control structures
1407                          */
1408                         skb->protocol = htons(ETH_P_IPV6);
1409                         skb->ip_summed = csummode;
1410                         skb->csum = 0;
1411                         /* reserve for fragmentation and ipsec header */
1412                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1413                                     dst_exthdrlen);
1414 
1415                         /* Only the initial fragment is time stamped */
1416                         skb_shinfo(skb)->tx_flags = tx_flags;
1417                         tx_flags = 0;
1418                         skb_shinfo(skb)->tskey = tskey;
1419                         tskey = 0;
1420 
1421                         /*
1422                          *      Find where to start putting bytes
1423                          */
1424                         data = skb_put(skb, fraglen);
1425                         skb_set_network_header(skb, exthdrlen);
1426                         data += fragheaderlen;
1427                         skb->transport_header = (skb->network_header +
1428                                                  fragheaderlen);
                             /*
                              * Move the excess tail of skb_prev into the new
                              * skb, take its checksum contribution out of
                              * skb_prev and trim skb_prev to maxfraglen.
                              */
1429                         if (fraggap) {
1430                                 skb->csum = skb_copy_and_csum_bits(
1431                                         skb_prev, maxfraglen,
1432                                         data + transhdrlen, fraggap, 0);
1433                                 skb_prev->csum = csum_sub(skb_prev->csum,
1434                                                           skb->csum);
1435                                 data += fraggap;
1436                                 pskb_trim_unique(skb_prev, maxfraglen);
1437                         }
1438                         if (copy > 0 &&
1439                             getfrag(from, data + transhdrlen, offset,
1440                                     copy, fraggap, skb) < 0) {
1441                                 err = -EFAULT;
1442                                 kfree_skb(skb);
1443                                 goto error;
1444                         }
1445 
                             /*
                              * The per-first-skb lengths are used up now;
                              * later skbs are built without them.
                              */
1446                         offset += copy;
1447                         length -= datalen - fraggap;
1448                         transhdrlen = 0;
1449                         exthdrlen = 0;
1450                         dst_exthdrlen = 0;
1451 
1452                         if ((flags & MSG_CONFIRM) && !skb_prev)
1453                                 skb_set_dst_pending_confirm(skb, 1);
1454 
1455                         /*
1456                          * Put the packet on the pending queue
1457                          */
1458                         __skb_queue_tail(queue, skb);
1459                         continue;
1460                 }
1461 
1462                 if (copy > length)
1463                         copy = length;
1464 
                     /*
                      * Without NETIF_F_SG the bytes are appended with
                      * skb_put(); otherwise they are written into the page
                      * fragment @pfrag, which is referenced from the skb's
                      * frags[] array.
                      */
1465                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1466                         unsigned int off;
1467 
1468                         off = skb->len;
1469                         if (getfrag(from, skb_put(skb, copy),
1470                                                 offset, copy, off, skb) < 0) {
1471                                 __skb_trim(skb, off);
1472                                 err = -EFAULT;
1473                                 goto error;
1474                         }
1475                 } else {
1476                         int i = skb_shinfo(skb)->nr_frags;
1477 
1478                         err = -ENOMEM;
1479                         if (!sk_page_frag_refill(sk, pfrag))
1480                                 goto error;
1481 
                             /*
                              * Start a new frags[] entry unless the data can
                              * be coalesced with the last one.
                              */
1482                         if (!skb_can_coalesce(skb, i, pfrag->page,
1483                                               pfrag->offset)) {
1484                                 err = -EMSGSIZE;
1485                                 if (i == MAX_SKB_FRAGS)
1486                                         goto error;
1487 
1488                                 __skb_fill_page_desc(skb, i, pfrag->page,
1489                                                      pfrag->offset, 0);
1490                                 skb_shinfo(skb)->nr_frags = ++i;
1491                                 get_page(pfrag->page);
1492                         }
1493                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1494                         if (getfrag(from,
1495                                     page_address(pfrag->page) + pfrag->offset,
1496                                     offset, copy, skb->len, skb) < 0)
1497                                 goto error_efault;
1498 
                             /*
                              * Account the added bytes by hand: in the page
                              * fragment, the frags[] entry, the skb lengths
                              * and the socket's sk_wmem_alloc.
                              */
1499                         pfrag->offset += copy;
1500                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1501                         skb->len += copy;
1502                         skb->data_len += copy;
1503                         skb->truesize += copy;
1504                         refcount_add(copy, &sk->sk_wmem_alloc);
1505                 }
1506                 offset += copy;
1507                 length -= copy;
1508         }
1509 
1510         return 0;
1511 
1512 error_efault:
1513         err = -EFAULT;
1514 error:
             /* Undo the accounting for the bytes that were not queued. */
1515         cork->length -= length;
1516         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1517         return err;
1518 }
1519
1520 int ip6_append_data(struct sock *sk,
1521                     int getfrag(void *from, char *to, int offset, int len,
1522                                 int odd, struct sk_buff *skb),
1523                     void *from, int length, int transhdrlen,
1524                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1525                     struct rt6_info *rt, unsigned int flags,
1526                     const struct sockcm_cookie *sockc)
1527 {
1528         struct inet_sock *inet = inet_sk(sk);
1529         struct ipv6_pinfo *np = inet6_sk(sk);
1530         int exthdrlen;
1531         int err;
1532
1533         if (flags&MSG_PROBE)
1534                 return 0;
1535         if (skb_queue_empty(&sk->sk_write_queue)) {
1536                 /*
1537                  * setup for corking
1538                  */
1539                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1540                                      ipc6, rt, fl6);
1541                 if (err)
1542                         return err;
1543
1544                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1545                 length += exthdrlen;
1546                 transhdrlen += exthdrlen;
1547         } else {
1548                 fl6 = &inet->cork.fl.u.ip6;
1549                 transhdrlen = 0;
1550         }
1551
1552         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1553                                  &np->cork, sk_page_frag(sk), getfrag,
1554                                  from, length, transhdrlen, flags, ipc6, sockc);
1555 }
1556 EXPORT_SYMBOL_GPL(ip6_append_data);
1557
1558 static void ip6_cork_release(struct inet_cork_full *cork,
1559                              struct inet6_cork *v6_cork)
1560 {
1561         if (v6_cork->opt) {
1562                 kfree(v6_cork->opt->dst0opt);
1563                 kfree(v6_cork->opt->dst1opt);
1564                 kfree(v6_cork->opt->hopopt);
1565                 kfree(v6_cork->opt->srcrt);
1566                 kfree(v6_cork->opt);
1567                 v6_cork->opt = NULL;
1568         }
1569
1570         if (cork->base.dst) {
1571                 dst_release(cork->base.dst);
1572                 cork->base.dst = NULL;
1573                 cork->base.flags &= ~IPCORK_ALLFRAG;
1574         }
1575         memset(&cork->fl, 0, sizeof(cork->fl));
1576 }
1577
1578 struct sk_buff *__ip6_make_skb(struct sock *sk,
1579                                struct sk_buff_head *queue,
1580                                struct inet_cork_full *cork,
1581                                struct inet6_cork *v6_cork)
1582 {
1583         struct sk_buff *skb, *tmp_skb;
1584         struct sk_buff **tail_skb;
1585         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1586         struct ipv6_pinfo *np = inet6_sk(sk);
1587         struct net *net = sock_net(sk);
1588         struct ipv6hdr *hdr;
1589         struct ipv6_txoptions *opt = v6_cork->opt;
1590         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1591         struct flowi6 *fl6 = &cork->fl.u.ip6;
1592         unsigned char proto = fl6->flowi6_proto;
1593
1594         skb = __skb_dequeue(queue);
1595         if (!skb)
1596                 goto out;
1597         tail_skb = &(skb_shinfo(skb)->frag_list);
1598
1599         /* move skb->data to ip header from ext header */
1600         if (skb->data < skb_network_header(skb))
1601                 __skb_pull(skb, skb_network_offset(skb));
1602         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1603                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1604                 *tail_skb = tmp_skb;
1605                 tail_skb = &(tmp_skb->next);
1606                 skb->len += tmp_skb->len;
1607                 skb->data_len += tmp_skb->len;
1608                 skb->truesize += tmp_skb->truesize;
1609                 tmp_skb->destructor = NULL;
1610                 tmp_skb->sk = NULL;
1611         }
1612
1613         /* Allow local fragmentation. */
1614         skb->ignore_df = ip6_sk_ignore_df(sk);
1615
1616         *final_dst = fl6->daddr;
1617         __skb_pull(skb, skb_network_header_len(skb));
1618         if (opt && opt->opt_flen)
1619                 ipv6_push_frag_opts(skb, opt, &proto);
1620         if (opt && opt->opt_nflen)
1621                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1622
1623         skb_push(skb, sizeof(struct ipv6hdr));
1624         skb_reset_network_header(skb);
1625         hdr = ipv6_hdr(skb);
1626
1627         ip6_flow_hdr(hdr, v6_cork->tclass,
1628                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1629                                         np->autoflowlabel, fl6));
1630         hdr->hop_limit = v6_cork->hop_limit;
1631         hdr->nexthdr = proto;
1632         hdr->saddr = fl6->saddr;
1633         hdr->daddr = *final_dst;
1634
1635         skb->priority = sk->sk_priority;
1636         skb->mark = sk->sk_mark;
1637
1638         skb_dst_set(skb, dst_clone(&rt->dst));
1639         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1640         if (proto == IPPROTO_ICMPV6) {
1641                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1642
1643                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1644                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1645         }
1646
1647         ip6_cork_release(cork, v6_cork);
1648 out:
1649         return skb;
1650 }
1651
1652 int ip6_send_skb(struct sk_buff *skb)
1653 {
1654         struct net *net = sock_net(skb->sk);
1655         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1656         int err;
1657
1658         err = ip6_local_out(net, skb->sk, skb);
1659         if (err) {
1660                 if (err > 0)
1661                         err = net_xmit_errno(err);
1662                 if (err)
1663                         IP6_INC_STATS(net, rt->rt6i_idev,
1664                                       IPSTATS_MIB_OUTDISCARDS);
1665         }
1666
1667         return err;
1668 }
1669
/*
 * ip6_push_pending_frames - build and send whatever is pending on @sk
 *
 * Asks ip6_finish_skb() for the finished skb and sends it with
 * ip6_send_skb().  Returns 0 when there is no skb to send, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1681
1682 static void __ip6_flush_pending_frames(struct sock *sk,
1683                                        struct sk_buff_head *queue,
1684                                        struct inet_cork_full *cork,
1685                                        struct inet6_cork *v6_cork)
1686 {
1687         struct sk_buff *skb;
1688
1689         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1690                 if (skb_dst(skb))
1691                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1692                                       IPSTATS_MIB_OUTDISCARDS);
1693                 kfree_skb(skb);
1694         }
1695
1696         ip6_cork_release(cork, v6_cork);
1697 }
1698
1699 void ip6_flush_pending_frames(struct sock *sk)
1700 {
1701         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1702                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1703 }
1704 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1705
1706 struct sk_buff *ip6_make_skb(struct sock *sk,
1707                              int getfrag(void *from, char *to, int offset,
1708                                          int len, int odd, struct sk_buff *skb),
1709                              void *from, int length, int transhdrlen,
1710                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1711                              struct rt6_info *rt, unsigned int flags,
1712                              const struct sockcm_cookie *sockc)
1713 {
1714         struct inet_cork_full cork;
1715         struct inet6_cork v6_cork;
1716         struct sk_buff_head queue;
1717         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1718         int err;
1719
1720         if (flags & MSG_PROBE)
1721                 return NULL;
1722
1723         __skb_queue_head_init(&queue);
1724
1725         cork.base.flags = 0;
1726         cork.base.addr = 0;
1727         cork.base.opt = NULL;
1728         v6_cork.opt = NULL;
1729         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1730         if (err)
1731                 return ERR_PTR(err);
1732
1733         if (ipc6->dontfrag < 0)
1734                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1735
1736         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1737                                 &current->task_frag, getfrag, from,
1738                                 length + exthdrlen, transhdrlen + exthdrlen,
1739                                 flags, ipc6, sockc);
1740         if (err) {
1741                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1742                 return ERR_PTR(err);
1743         }
1744
1745         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1746 }