net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int __ip6_local_out(struct sk_buff *skb)
  60 {
  61         int len;
  62
  63         len = skb->len - sizeof(struct ipv6hdr);
  64         if (len > IPV6_MAXPLEN)
  65                 len = 0;
  66         ipv6_hdr(skb)->payload_len = htons(len);
  67
  68         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  69                        skb_dst(skb)->dev, dst_output);
  70 }
  71
  72 int ip6_local_out(struct sk_buff *skb)
  73 {
  74         int err;
  75
  76         err = __ip6_local_out(skb);
  77         if (likely(err == 1))
  78                 err = dst_output(skb);
  79
  80         return err;
  81 }
  82 EXPORT_SYMBOL_GPL(ip6_local_out);
  83
  84 static int ip6_finish_output2(struct sk_buff *skb)
  85 {
  86         struct dst_entry *dst = skb_dst(skb);
  87         struct net_device *dev = dst->dev;
  88         struct neighbour *neigh;
  89         struct in6_addr *nexthop;
  90         int ret;
  91
  92         skb->protocol = htons(ETH_P_IPV6);
  93         skb->dev = dev;
  94
  95         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  96                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  97
  98                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
  99                     ((mroute6_socket(dev_net(dev), skb) &&
 100                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 101                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 102                                          &ipv6_hdr(skb)->saddr))) {
 103                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 104
 105                         /* Do not check for IFF_ALLMULTI; multicast routing
 106                            is not supported in any case.
 107                          */
 108                         if (newskb)
 109                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 110                                         newskb, NULL, newskb->dev,
 111                                         dev_loopback_xmit);
 112
 113                         if (ipv6_hdr(skb)->hop_limit == 0) {
 114                                 IP6_INC_STATS(dev_net(dev), idev,
 115                                               IPSTATS_MIB_OUTDISCARDS);
 116                                 kfree_skb(skb);
 117                                 return 0;
 118                         }
 119                 }
 120
 121                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 122                                 skb->len);
 123
 124                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 125                     IPV6_ADDR_SCOPE_NODELOCAL &&
 126                     !(dev->flags & IFF_LOOPBACK)) {
 127                         kfree_skb(skb);
 128                         return 0;
 129                 }
 130         }
 131
 132         rcu_read_lock_bh();
 133         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 134         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 135         if (unlikely(!neigh))
 136                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 137         if (!IS_ERR(neigh)) {
 138                 ret = dst_neigh_output(dst, neigh, skb);
 139                 rcu_read_unlock_bh();
 140                 return ret;
 141         }
 142         rcu_read_unlock_bh();
 143
 144         IP6_INC_STATS_BH(dev_net(dst->dev),
 145                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 146         kfree_skb(skb);
 147         return -EINVAL;
 148 }
 149
 150 static int ip6_finish_output(struct sk_buff *skb)
 151 {
 152         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 153             dst_allfrag(skb_dst(skb)))
 154                 return ip6_fragment(skb, ip6_finish_output2);
 155         else
 156                 return ip6_finish_output2(skb);
 157 }
 158
 159 int ip6_output(struct sk_buff *skb)
 160 {
 161         struct net_device *dev = skb_dst(skb)->dev;
 162         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 163         if (unlikely(idev->cnf.disable_ipv6)) {
 164                 IP6_INC_STATS(dev_net(dev), idev,
 165                               IPSTATS_MIB_OUTDISCARDS);
 166                 kfree_skb(skb);
 167                 return 0;
 168         }
 169
 170         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 171                             ip6_finish_output,
 172                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 173 }
 174
 175 /*
 176  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 177  */
 178
 179 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 180              struct ipv6_txoptions *opt, int tclass)
 181 {
 182         struct net *net = sock_net(sk);
 183         struct ipv6_pinfo *np = inet6_sk(sk);
 184         struct in6_addr *first_hop = &fl6->daddr;
 185         struct dst_entry *dst = skb_dst(skb);
 186         struct ipv6hdr *hdr;
 187         u8  proto = fl6->flowi6_proto;
 188         int seg_len = skb->len;
 189         int hlimit = -1;
 190         u32 mtu;
 191
 192         if (opt) {
 193                 unsigned int head_room;
 194
 195                 /* First: exthdrs may take lots of space (~8K for now)
 196                    MAX_HEADER is not enough.
 197                  */
 198                 head_room = opt->opt_nflen + opt->opt_flen;
 199                 seg_len += head_room;
 200                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 201
 202                 if (skb_headroom(skb) < head_room) {
 203                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 204                         if (skb2 == NULL) {
 205                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 206                                               IPSTATS_MIB_OUTDISCARDS);
 207                                 kfree_skb(skb);
 208                                 return -ENOBUFS;
 209                         }
 210                         consume_skb(skb);
 211                         skb = skb2;
 212                         skb_set_owner_w(skb, sk);
 213                 }
 214                 if (opt->opt_flen)
 215                         ipv6_push_frag_opts(skb, opt, &proto);
 216                 if (opt->opt_nflen)
 217                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 218         }
 219
 220         skb_push(skb, sizeof(struct ipv6hdr));
 221         skb_reset_network_header(skb);
 222         hdr = ipv6_hdr(skb);
 223
 224         /*
 225          *      Fill in the IPv6 header
 226          */
 227         if (np)
 228                 hlimit = np->hop_limit;
 229         if (hlimit < 0)
 230                 hlimit = ip6_dst_hoplimit(dst);
 231
 232         ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
 233
 234         hdr->payload_len = htons(seg_len);
 235         hdr->nexthdr = proto;
 236         hdr->hop_limit = hlimit;
 237
 238         hdr->saddr = fl6->saddr;
 239         hdr->daddr = *first_hop;
 240
 241         skb->priority = sk->sk_priority;
 242         skb->mark = sk->sk_mark;
 243
 244         mtu = dst_mtu(dst);
 245         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 246                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 247                               IPSTATS_MIB_OUT, skb->len);
 248                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 249                                dst->dev, dst_output);
 250         }
 251
 252         skb->dev = dst->dev;
 253         ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
 254         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 255         kfree_skb(skb);
 256         return -EMSGSIZE;
 257 }
 258
 259 EXPORT_SYMBOL(ip6_xmit);
 260
 261 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 262 {
 263         struct ip6_ra_chain *ra;
 264         struct sock *last = NULL;
 265
 266         read_lock(&ip6_ra_lock);
 267         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 268                 struct sock *sk = ra->sk;
 269                 if (sk && ra->sel == sel &&
 270                     (!sk->sk_bound_dev_if ||
 271                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 272                         if (last) {
 273                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 274                                 if (skb2)
 275                                         rawv6_rcv(last, skb2);
 276                         }
 277                         last = sk;
 278                 }
 279         }
 280
 281         if (last) {
 282                 rawv6_rcv(last, skb);
 283                 read_unlock(&ip6_ra_lock);
 284                 return 1;
 285         }
 286         read_unlock(&ip6_ra_lock);
 287         return 0;
 288 }
 289
 290 static int ip6_forward_proxy_check(struct sk_buff *skb)
 291 {
 292         struct ipv6hdr *hdr = ipv6_hdr(skb);
 293         u8 nexthdr = hdr->nexthdr;
 294         __be16 frag_off;
 295         int offset;
 296
 297         if (ipv6_ext_hdr(nexthdr)) {
 298                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 299                 if (offset < 0)
 300                         return 0;
 301         } else
 302                 offset = sizeof(struct ipv6hdr);
 303
 304         if (nexthdr == IPPROTO_ICMPV6) {
 305                 struct icmp6hdr *icmp6;
 306
 307                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 308                                          offset + 1 - skb->data)))
 309                         return 0;
 310
 311                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 312
 313                 switch (icmp6->icmp6_type) {
 314                 case NDISC_ROUTER_SOLICITATION:
 315                 case NDISC_ROUTER_ADVERTISEMENT:
 316                 case NDISC_NEIGHBOUR_SOLICITATION:
 317                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 318                 case NDISC_REDIRECT:
 319                         /* For reaction involving unicast neighbor discovery
 320                          * message destined to the proxied address, pass it to
 321                          * input function.
 322                          */
 323                         return 1;
 324                 default:
 325                         break;
 326                 }
 327         }
 328
 329         /*
 330          * The proxying router can't forward traffic sent to a link-local
 331          * address, so signal the sender and discard the packet. This
 332          * behavior is clarified by the MIPv6 specification.
 333          */
 334         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 335                 dst_link_failure(skb);
 336                 return -1;
 337         }
 338
 339         return 0;
 340 }
 341
 342 static inline int ip6_forward_finish(struct sk_buff *skb)
 343 {
 344         return dst_output(skb);
 345 }
 346
 347 int ip6_forward(struct sk_buff *skb)
 348 {
 349         struct dst_entry *dst = skb_dst(skb);
 350         struct ipv6hdr *hdr = ipv6_hdr(skb);
 351         struct inet6_skb_parm *opt = IP6CB(skb);
 352         struct net *net = dev_net(dst->dev);
 353         u32 mtu;
 354
 355         if (net->ipv6.devconf_all->forwarding == 0)
 356                 goto error;
 357
 358         if (skb_warn_if_lro(skb))
 359                 goto drop;
 360
 361         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 362                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 363                 goto drop;
 364         }
 365
 366         if (skb->pkt_type != PACKET_HOST)
 367                 goto drop;
 368
 369         skb_forward_csum(skb);
 370
 371         /*
 372          *      We DO NOT make any processing on
 373          *      RA packets, pushing them to user level AS IS
 374          *      without ane WARRANTY that application will be able
 375          *      to interpret them. The reason is that we
 376          *      cannot make anything clever here.
 377          *
 378          *      We are not end-node, so that if packet contains
 379          *      AH/ESP, we cannot make anything.
 380          *      Defragmentation also would be mistake, RA packets
 381          *      cannot be fragmented, because there is no warranty
 382          *      that different fragments will go along one path. --ANK
 383          */
 384         if (opt->ra) {
 385                 u8 *ptr = skb_network_header(skb) + opt->ra;
 386                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 387                         return 0;
 388         }
 389
 390         /*
 391          *      check and decrement ttl
 392          */
 393         if (hdr->hop_limit <= 1) {
 394                 /* Force OUTPUT device used as source address */
 395                 skb->dev = dst->dev;
 396                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 397                 IP6_INC_STATS_BH(net,
 398                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 399
 400                 kfree_skb(skb);
 401                 return -ETIMEDOUT;
 402         }
 403
 404         /* XXX: idev->cnf.proxy_ndp? */
 405         if (net->ipv6.devconf_all->proxy_ndp &&
 406             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 407                 int proxied = ip6_forward_proxy_check(skb);
 408                 if (proxied > 0)
 409                         return ip6_input(skb);
 410                 else if (proxied < 0) {
 411                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 412                                       IPSTATS_MIB_INDISCARDS);
 413                         goto drop;
 414                 }
 415         }
 416
 417         if (!xfrm6_route_forward(skb)) {
 418                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 419                 goto drop;
 420         }
 421         dst = skb_dst(skb);
 422
 423         /* IPv6 specs say nothing about it, but it is clear that we cannot
 424            send redirects to source routed frames.
 425            We don't send redirects to frames decapsulated from IPsec.
 426          */
 427         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 428                 struct in6_addr *target = NULL;
 429                 struct inet_peer *peer;
 430                 struct rt6_info *rt;
 431
 432                 /*
 433                  *      incoming and outgoing devices are the same
 434                  *      send a redirect.
 435                  */
 436
 437                 rt = (struct rt6_info *) dst;
 438                 if (rt->rt6i_flags & RTF_GATEWAY)
 439                         target = &rt->rt6i_gateway;
 440                 else
 441                         target = &hdr->daddr;
 442
 443                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 444
 445                 /* Limit redirects both by destination (here)
 446                    and by source (inside ndisc_send_redirect)
 447                  */
 448                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 449                         ndisc_send_redirect(skb, target);
 450                 if (peer)
 451                         inet_putpeer(peer);
 452         } else {
 453                 int addrtype = ipv6_addr_type(&hdr->saddr);
 454
 455                 /* This check is security critical. */
 456                 if (addrtype == IPV6_ADDR_ANY ||
 457                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 458                         goto error;
 459                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 460                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 461                                     ICMPV6_NOT_NEIGHBOUR, 0);
 462                         goto error;
 463                 }
 464         }
 465
 466         mtu = dst_mtu(dst);
 467         if (mtu < IPV6_MIN_MTU)
 468                 mtu = IPV6_MIN_MTU;
 469
 470         if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
 471             (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
 472                 /* Again, force OUTPUT device used as source address */
 473                 skb->dev = dst->dev;
 474                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 475                 IP6_INC_STATS_BH(net,
 476                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 477                 IP6_INC_STATS_BH(net,
 478                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 479                 kfree_skb(skb);
 480                 return -EMSGSIZE;
 481         }
 482
 483         if (skb_cow(skb, dst->dev->hard_header_len)) {
 484                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 485                 goto drop;
 486         }
 487
 488         hdr = ipv6_hdr(skb);
 489
 490         /* Mangling hops number delayed to point after skb COW */
 491
 492         hdr->hop_limit--;
 493
 494         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 495         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 496         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 497                        ip6_forward_finish);
 498
 499 error:
 500         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 501 drop:
 502         kfree_skb(skb);
 503         return -EINVAL;
 504 }
 505
 506 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 507 {
 508         to->pkt_type = from->pkt_type;
 509         to->priority = from->priority;
 510         to->protocol = from->protocol;
 511         skb_dst_drop(to);
 512         skb_dst_set(to, dst_clone(skb_dst(from)));
 513         to->dev = from->dev;
 514         to->mark = from->mark;
 515
 516 #ifdef CONFIG_NET_SCHED
 517         to->tc_index = from->tc_index;
 518 #endif
 519         nf_copy(to, from);
 520 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 521         to->nf_trace = from->nf_trace;
 522 #endif
 523         skb_copy_secmark(to, from);
 524 }
 525
 526 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 527 {
 528         struct sk_buff *frag;
 529         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 530         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 531         struct ipv6hdr *tmp_hdr;
 532         struct frag_hdr *fh;
 533         unsigned int mtu, hlen, left, len;
 534         int hroom, troom;
 535         __be32 frag_id = 0;
 536         int ptr, offset = 0, err=0;
 537         u8 *prevhdr, nexthdr = 0;
 538         struct net *net = dev_net(skb_dst(skb)->dev);
 539
 540         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 541         nexthdr = *prevhdr;
 542
 543         mtu = ip6_skb_dst_mtu(skb);
 544
 545         /* We must not fragment if the socket is set to force MTU discovery
 546          * or if the skb it not generated by a local socket.
 547          */
 548         if (unlikely(!skb->local_df && skb->len > mtu) ||
 549                      (IP6CB(skb)->frag_max_size &&
 550                       IP6CB(skb)->frag_max_size > mtu)) {
 551                 if (skb->sk && dst_allfrag(skb_dst(skb)))
 552                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 553
 554                 skb->dev = skb_dst(skb)->dev;
 555                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 556                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 557                               IPSTATS_MIB_FRAGFAILS);
 558                 kfree_skb(skb);
 559                 return -EMSGSIZE;
 560         }
 561
 562         if (np && np->frag_size < mtu) {
 563                 if (np->frag_size)
 564                         mtu = np->frag_size;
 565         }
 566         mtu -= hlen + sizeof(struct frag_hdr);
 567
 568         if (skb_has_frag_list(skb)) {
 569                 int first_len = skb_pagelen(skb);
 570                 struct sk_buff *frag2;
 571
 572                 if (first_len - hlen > mtu ||
 573                     ((first_len - hlen) & 7) ||
 574                     skb_cloned(skb))
 575                         goto slow_path;
 576
 577                 skb_walk_frags(skb, frag) {
 578                         /* Correct geometry. */
 579                         if (frag->len > mtu ||
 580                             ((frag->len & 7) && frag->next) ||
 581                             skb_headroom(frag) < hlen)
 582                                 goto slow_path_clean;
 583
 584                         /* Partially cloned skb? */
 585                         if (skb_shared(frag))
 586                                 goto slow_path_clean;
 587
 588                         BUG_ON(frag->sk);
 589                         if (skb->sk) {
 590                                 frag->sk = skb->sk;
 591                                 frag->destructor = sock_wfree;
 592                         }
 593                         skb->truesize -= frag->truesize;
 594                 }
 595
 596                 err = 0;
 597                 offset = 0;
 598                 frag = skb_shinfo(skb)->frag_list;
 599                 skb_frag_list_init(skb);
 600                 /* BUILD HEADER */
 601
 602                 *prevhdr = NEXTHDR_FRAGMENT;
 603                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 604                 if (!tmp_hdr) {
 605                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 606                                       IPSTATS_MIB_FRAGFAILS);
 607                         return -ENOMEM;
 608                 }
 609
 610                 __skb_pull(skb, hlen);
 611                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 612                 __skb_push(skb, hlen);
 613                 skb_reset_network_header(skb);
 614                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 615
 616                 ipv6_select_ident(fh, rt);
 617                 fh->nexthdr = nexthdr;
 618                 fh->reserved = 0;
 619                 fh->frag_off = htons(IP6_MF);
 620                 frag_id = fh->identification;
 621
 622                 first_len = skb_pagelen(skb);
 623                 skb->data_len = first_len - skb_headlen(skb);
 624                 skb->len = first_len;
 625                 ipv6_hdr(skb)->payload_len = htons(first_len -
 626                                                    sizeof(struct ipv6hdr));
 627
 628                 dst_hold(&rt->dst);
 629
 630                 for (;;) {
 631                         /* Prepare header of the next frame,
 632                          * before previous one went down. */
 633                         if (frag) {
 634                                 frag->ip_summed = CHECKSUM_NONE;
 635                                 skb_reset_transport_header(frag);
 636                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 637                                 __skb_push(frag, hlen);
 638                                 skb_reset_network_header(frag);
 639                                 memcpy(skb_network_header(frag), tmp_hdr,
 640                                        hlen);
 641                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 642                                 fh->nexthdr = nexthdr;
 643                                 fh->reserved = 0;
 644                                 fh->frag_off = htons(offset);
 645                                 if (frag->next != NULL)
 646                                         fh->frag_off |= htons(IP6_MF);
 647                                 fh->identification = frag_id;
 648                                 ipv6_hdr(frag)->payload_len =
 649                                                 htons(frag->len -
 650                                                       sizeof(struct ipv6hdr));
 651                                 ip6_copy_metadata(frag, skb);
 652                         }
 653
 654                         err = output(skb);
 655                         if(!err)
 656                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 657                                               IPSTATS_MIB_FRAGCREATES);
 658
 659                         if (err || !frag)
 660                                 break;
 661
 662                         skb = frag;
 663                         frag = skb->next;
 664                         skb->next = NULL;
 665                 }
 666
 667                 kfree(tmp_hdr);
 668
 669                 if (err == 0) {
 670                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 671                                       IPSTATS_MIB_FRAGOKS);
 672                         ip6_rt_put(rt);
 673                         return 0;
 674                 }
 675
 676                 while (frag) {
 677                         skb = frag->next;
 678                         kfree_skb(frag);
 679                         frag = skb;
 680                 }
 681
 682                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 683                               IPSTATS_MIB_FRAGFAILS);
 684                 ip6_rt_put(rt);
 685                 return err;
 686
 687 slow_path_clean:
 688                 skb_walk_frags(skb, frag2) {
 689                         if (frag2 == frag)
 690                                 break;
 691                         frag2->sk = NULL;
 692                         frag2->destructor = NULL;
 693                         skb->truesize += frag2->truesize;
 694                 }
 695         }
 696
 697 slow_path:
 698         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 699             skb_checksum_help(skb))
 700                 goto fail;
 701
 702         left = skb->len - hlen;         /* Space per frame */
 703         ptr = hlen;                     /* Where to start from */
 704
 705         /*
 706          *      Fragment the datagram.
 707          */
 708
 709         *prevhdr = NEXTHDR_FRAGMENT;
 710         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 711         troom = rt->dst.dev->needed_tailroom;
 712
 713         /*
 714          *      Keep copying data until we run out.
 715          */
 716         while(left > 0) {
 717                 len = left;
 718                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 719                 if (len > mtu)
 720                         len = mtu;
 721                 /* IF: we are not sending up to and including the packet end
 722                    then align the next start on an eight byte boundary */
 723                 if (len < left) {
 724                         len &= ~7;
 725                 }
 726                 /*
 727                  *      Allocate buffer.
 728                  */
 729
 730                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 731                                       hroom + troom, GFP_ATOMIC)) == NULL) {
 732                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 733                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 734                                       IPSTATS_MIB_FRAGFAILS);
 735                         err = -ENOMEM;
 736                         goto fail;
 737                 }
 738
 739                 /*
 740                  *      Set up data on packet
 741                  */
 742
 743                 ip6_copy_metadata(frag, skb);
 744                 skb_reserve(frag, hroom);
 745                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 746                 skb_reset_network_header(frag);
 747                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 748                 frag->transport_header = (frag->network_header + hlen +
 749                                           sizeof(struct frag_hdr));
 750
 751                 /*
 752                  *      Charge the memory for the fragment to any owner
 753                  *      it might possess
 754                  */
 755                 if (skb->sk)
 756                         skb_set_owner_w(frag, skb->sk);
 757
 758                 /*
 759                  *      Copy the packet header into the new buffer.
 760                  */
 761                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 762
 763                 /*
 764                  *      Build fragment header.
 765                  */
 766                 fh->nexthdr = nexthdr;
 767                 fh->reserved = 0;
 768                 if (!frag_id) {
 769                         ipv6_select_ident(fh, rt);
 770                         frag_id = fh->identification;
 771                 } else
 772                         fh->identification = frag_id;
 773
 774                 /*
 775                  *      Copy a block of the IP datagram.
 776                  */
 777                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 778                         BUG();
 779                 left -= len;
 780
 781                 fh->frag_off = htons(offset);
 782                 if (left > 0)
 783                         fh->frag_off |= htons(IP6_MF);
 784                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 785                                                     sizeof(struct ipv6hdr));
 786
 787                 ptr += len;
 788                 offset += len;
 789
 790                 /*
 791                  *      Put this fragment into the sending queue.
 792                  */
 793                 err = output(frag);
 794                 if (err)
 795                         goto fail;
 796
 797                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 798                               IPSTATS_MIB_FRAGCREATES);
 799         }
 800         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 801                       IPSTATS_MIB_FRAGOKS);
 802         consume_skb(skb);
 803         return err;
 804
 805 fail:
 806         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 807                       IPSTATS_MIB_FRAGFAILS);
 808         kfree_skb(skb);
 809         return err;
 810 }
 811
 812 static inline int ip6_rt_check(const struct rt6key *rt_key,
 813                                const struct in6_addr *fl_addr,
 814                                const struct in6_addr *addr_cache)
 815 {
 816         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 817                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 818 }
 819
 820 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 821                                           struct dst_entry *dst,
 822                                           const struct flowi6 *fl6)
 823 {
 824         struct ipv6_pinfo *np = inet6_sk(sk);
 825         struct rt6_info *rt = (struct rt6_info *)dst;
 826
 827         if (!dst)
 828                 goto out;
 829
 830         /* Yes, checking route validity in not connected
 831          * case is not very simple. Take into account,
 832          * that we do not support routing by source, TOS,
 833          * and MSG_DONTROUTE            --ANK (980726)
 834          *
 835          * 1. ip6_rt_check(): If route was host route,
 836          *    check that cached destination is current.
 837          *    If it is network route, we still may
 838          *    check its validity using saved pointer
 839          *    to the last used address: daddr_cache.
 840          *    We do not want to save whole address now,
 841          *    (because main consumer of this service
 842          *    is tcp, which has not this problem),
 843          *    so that the last trick works only on connected
 844          *    sockets.
 845          * 2. oif also should be the same.
 846          */
 847         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 848 #ifdef CONFIG_IPV6_SUBTREES
 849             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 850 #endif
 851             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 852                 dst_release(dst);
 853                 dst = NULL;
 854         }
 855
 856 out:
 857         return dst;
 858 }
 859
 860 static int ip6_dst_lookup_tail(struct sock *sk,
 861                                struct dst_entry **dst, struct flowi6 *fl6)
 862 {
 863         struct net *net = sock_net(sk);
 864 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 865         struct neighbour *n;
 866         struct rt6_info *rt;
 867 #endif
 868         int err;
 869
 870         if (*dst == NULL)
 871                 *dst = ip6_route_output(net, sk, fl6);
 872
 873         if ((err = (*dst)->error))
 874                 goto out_err_release;
 875
 876         if (ipv6_addr_any(&fl6->saddr)) {
 877                 struct rt6_info *rt = (struct rt6_info *) *dst;
 878                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 879                                           sk ? inet6_sk(sk)->srcprefs : 0,
 880                                           &fl6->saddr);
 881                 if (err)
 882                         goto out_err_release;
 883         }
 884
 885 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 886         /*
 887          * Here if the dst entry we've looked up
 888          * has a neighbour entry that is in the INCOMPLETE
 889          * state and the src address from the flow is
 890          * marked as OPTIMISTIC, we release the found
 891          * dst entry and replace it instead with the
 892          * dst entry of the nexthop router
 893          */
 894         rt = (struct rt6_info *) *dst;
 895         rcu_read_lock_bh();
 896         n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
 897         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 898         rcu_read_unlock_bh();
 899
 900         if (err) {
 901                 struct inet6_ifaddr *ifp;
 902                 struct flowi6 fl_gw6;
 903                 int redirect;
 904
 905                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 906                                       (*dst)->dev, 1);
 907
 908                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 909                 if (ifp)
 910                         in6_ifa_put(ifp);
 911
 912                 if (redirect) {
 913                         /*
 914                          * We need to get the dst entry for the
 915                          * default router instead
 916                          */
 917                         dst_release(*dst);
 918                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 919                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 920                         *dst = ip6_route_output(net, sk, &fl_gw6);
 921                         if ((err = (*dst)->error))
 922                                 goto out_err_release;
 923                 }
 924         }
 925 #endif
 926
 927         return 0;
 928
 929 out_err_release:
 930         if (err == -ENETUNREACH)
 931                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 932         dst_release(*dst);
 933         *dst = NULL;
 934         return err;
 935 }
 936
 937 /**
 938  *      ip6_dst_lookup - perform route lookup on flow
 939  *      @sk: socket which provides route info
 940  *      @dst: pointer to dst_entry * for result
 941  *      @fl6: flow to lookup
 942  *
 943  *      This function performs a route lookup on the given flow.
 944  *
 945  *      It returns zero on success, or a standard errno code on error.
 946  */
 947 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
 948 {
 949         *dst = NULL;
 950         return ip6_dst_lookup_tail(sk, dst, fl6);
 951 }
 952 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 953
 954 /**
 955  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 956  *      @sk: socket which provides route info
 957  *      @fl6: flow to lookup
 958  *      @final_dst: final destination address for ipsec lookup
 959  *      @can_sleep: we are in a sleepable context
 960  *
 961  *      This function performs a route lookup on the given flow.
 962  *
 963  *      It returns a valid dst pointer on success, or a pointer encoded
 964  *      error code.
 965  */
 966 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 967                                       const struct in6_addr *final_dst,
 968                                       bool can_sleep)
 969 {
 970         struct dst_entry *dst = NULL;
 971         int err;
 972
 973         err = ip6_dst_lookup_tail(sk, &dst, fl6);
 974         if (err)
 975                 return ERR_PTR(err);
 976         if (final_dst)
 977                 fl6->daddr = *final_dst;
 978         if (can_sleep)
 979                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 980
 981         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 982 }
 983 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
 984
 985 /**
 986  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 987  *      @sk: socket which provides the dst cache and route info
 988  *      @fl6: flow to lookup
 989  *      @final_dst: final destination address for ipsec lookup
 990  *      @can_sleep: we are in a sleepable context
 991  *
 992  *      This function performs a route lookup on the given flow with the
 993  *      possibility of using the cached route in the socket if it is valid.
 994  *      It will take the socket dst lock when operating on the dst cache.
 995  *      As a result, this function can only be used in process context.
 996  *
 997  *      It returns a valid dst pointer on success, or a pointer encoded
 998  *      error code.
 999  */
1000 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1001                                          const struct in6_addr *final_dst,
1002                                          bool can_sleep)
1003 {
1004         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1005         int err;
1006
1007         dst = ip6_sk_dst_check(sk, dst, fl6);
1008
1009         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1010         if (err)
1011                 return ERR_PTR(err);
1012         if (final_dst)
1013                 fl6->daddr = *final_dst;
1014         if (can_sleep)
1015                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1016
1017         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1018 }
1019 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1020
1021 static inline int ip6_ufo_append_data(struct sock *sk,
1022                         int getfrag(void *from, char *to, int offset, int len,
1023                         int odd, struct sk_buff *skb),
1024                         void *from, int length, int hh_len, int fragheaderlen,
1025                         int transhdrlen, int mtu,unsigned int flags,
1026                         struct rt6_info *rt)
1027
1028 {
1029         struct sk_buff *skb;
1030         int err;
1031
1032         /* There is support for UDP large send offload by network
1033          * device, so create one single skb packet containing complete
1034          * udp datagram
1035          */
1036         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1037                 skb = sock_alloc_send_skb(sk,
1038                         hh_len + fragheaderlen + transhdrlen + 20,
1039                         (flags & MSG_DONTWAIT), &err);
1040                 if (skb == NULL)
1041                         return err;
1042
1043                 /* reserve space for Hardware header */
1044                 skb_reserve(skb, hh_len);
1045
1046                 /* create space for UDP/IP header */
1047                 skb_put(skb,fragheaderlen + transhdrlen);
1048
1049                 /* initialize network header pointer */
1050                 skb_reset_network_header(skb);
1051
1052                 /* initialize protocol header pointer */
1053                 skb->transport_header = skb->network_header + fragheaderlen;
1054
1055                 skb->ip_summed = CHECKSUM_PARTIAL;
1056                 skb->csum = 0;
1057         }
1058
1059         err = skb_append_datato_frags(sk,skb, getfrag, from,
1060                                       (length - transhdrlen));
1061         if (!err) {
1062                 struct frag_hdr fhdr;
1063
1064                 /* Specify the length of each IPv6 datagram fragment.
1065                  * It has to be a multiple of 8.
1066                  */
1067                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1068                                              sizeof(struct frag_hdr)) & ~7;
1069                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1070                 ipv6_select_ident(&fhdr, rt);
1071                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1072                 __skb_queue_tail(&sk->sk_write_queue, skb);
1073
1074                 return 0;
1075         }
1076         /* There is not enough support do UPD LSO,
1077          * so follow normal path
1078          */
1079         kfree_skb(skb);
1080
1081         return err;
1082 }
1083
1084 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1085                                                gfp_t gfp)
1086 {
1087         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1088 }
1089
1090 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1091                                                 gfp_t gfp)
1092 {
1093         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1094 }
1095
1096 static void ip6_append_data_mtu(int *mtu,
1097                                 int *maxfraglen,
1098                                 unsigned int fragheaderlen,
1099                                 struct sk_buff *skb,
1100                                 struct rt6_info *rt)
1101 {
1102         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1103                 if (skb == NULL) {
1104                         /* first fragment, reserve header_len */
1105                         *mtu = *mtu - rt->dst.header_len;
1106
1107                 } else {
1108                         /*
1109                          * this fragment is not first, the headers
1110                          * space is regarded as data space.
1111                          */
1112                         *mtu = dst_mtu(rt->dst.path);
1113                 }
1114                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1115                               + fragheaderlen - sizeof(struct frag_hdr);
1116         }
1117 }
1118
1119 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1120         int offset, int len, int odd, struct sk_buff *skb),
1121         void *from, int length, int transhdrlen,
1122         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1123         struct rt6_info *rt, unsigned int flags, int dontfrag)
1124 {
1125         struct inet_sock *inet = inet_sk(sk);
1126         struct ipv6_pinfo *np = inet6_sk(sk);
1127         struct inet_cork *cork;
1128         struct sk_buff *skb, *skb_prev = NULL;
1129         unsigned int maxfraglen, fragheaderlen;
1130         int exthdrlen;
1131         int dst_exthdrlen;
1132         int hh_len;
1133         int mtu;
1134         int copy;
1135         int err;
1136         int offset = 0;
1137         __u8 tx_flags = 0;
1138
1139         if (flags&MSG_PROBE)
1140                 return 0;
1141         cork = &inet->cork.base;
1142         if (skb_queue_empty(&sk->sk_write_queue)) {
1143                 /*
1144                  * setup for corking
1145                  */
1146                 if (opt) {
1147                         if (WARN_ON(np->cork.opt))
1148                                 return -EINVAL;
1149
1150                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1151                         if (unlikely(np->cork.opt == NULL))
1152                                 return -ENOBUFS;
1153
1154                         np->cork.opt->tot_len = opt->tot_len;
1155                         np->cork.opt->opt_flen = opt->opt_flen;
1156                         np->cork.opt->opt_nflen = opt->opt_nflen;
1157
1158                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1159                                                             sk->sk_allocation);
1160                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1161                                 return -ENOBUFS;
1162
1163                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1164                                                             sk->sk_allocation);
1165                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1166                                 return -ENOBUFS;
1167
1168                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1169                                                            sk->sk_allocation);
1170                         if (opt->hopopt && !np->cork.opt->hopopt)
1171                                 return -ENOBUFS;
1172
1173                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1174                                                             sk->sk_allocation);
1175                         if (opt->srcrt && !np->cork.opt->srcrt)
1176                                 return -ENOBUFS;
1177
1178                         /* need source address above miyazawa*/
1179                 }
1180                 dst_hold(&rt->dst);
1181                 cork->dst = &rt->dst;
1182                 inet->cork.fl.u.ip6 = *fl6;
1183                 np->cork.hop_limit = hlimit;
1184                 np->cork.tclass = tclass;
1185                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1186                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1187                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1188                 else
1189                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1190                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1191                 if (np->frag_size < mtu) {
1192                         if (np->frag_size)
1193                                 mtu = np->frag_size;
1194                 }
1195                 cork->fragsize = mtu;
1196                 if (dst_allfrag(rt->dst.path))
1197                         cork->flags |= IPCORK_ALLFRAG;
1198                 cork->length = 0;
1199                 exthdrlen = (opt ? opt->opt_flen : 0);
1200                 length += exthdrlen;
1201                 transhdrlen += exthdrlen;
1202                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1203         } else {
1204                 rt = (struct rt6_info *)cork->dst;
1205                 fl6 = &inet->cork.fl.u.ip6;
1206                 opt = np->cork.opt;
1207                 transhdrlen = 0;
1208                 exthdrlen = 0;
1209                 dst_exthdrlen = 0;
1210                 mtu = cork->fragsize;
1211         }
1212
1213         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1214
1215         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1216                         (opt ? opt->opt_nflen : 0);
1217         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1218
1219         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1220                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1221                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1222                         return -EMSGSIZE;
1223                 }
1224         }
1225
1226         /* For UDP, check if TX timestamp is enabled */
1227         if (sk->sk_type == SOCK_DGRAM) {
1228                 err = sock_tx_timestamp(sk, &tx_flags);
1229                 if (err)
1230                         goto error;
1231         }
1232
1233         /*
1234          * Let's try using as much space as possible.
1235          * Use MTU if total length of the message fits into the MTU.
1236          * Otherwise, we need to reserve fragment header and
1237          * fragment alignment (= 8-15 octects, in total).
1238          *
1239          * Note that we may need to "move" the data from the tail of
1240          * of the buffer to the new fragment when we split
1241          * the message.
1242          *
1243          * FIXME: It may be fragmented into multiple chunks
1244          *        at once if non-fragmentable extension headers
1245          *        are too large.
1246          * --yoshfuji
1247          */
1248
1249         cork->length += length;
1250         if (length > mtu) {
1251                 int proto = sk->sk_protocol;
1252                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1253                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1254                         return -EMSGSIZE;
1255                 }
1256
1257                 if (proto == IPPROTO_UDP &&
1258                     (rt->dst.dev->features & NETIF_F_UFO)) {
1259
1260                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1261                                                   hh_len, fragheaderlen,
1262                                                   transhdrlen, mtu, flags, rt);
1263                         if (err)
1264                                 goto error;
1265                         return 0;
1266                 }
1267         }
1268
1269         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1270                 goto alloc_new_skb;
1271
1272         while (length > 0) {
1273                 /* Check if the remaining data fits into current packet. */
1274                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1275                 if (copy < length)
1276                         copy = maxfraglen - skb->len;
1277
1278                 if (copy <= 0) {
1279                         char *data;
1280                         unsigned int datalen;
1281                         unsigned int fraglen;
1282                         unsigned int fraggap;
1283                         unsigned int alloclen;
1284 alloc_new_skb:
1285                         /* There's no room in the current skb */
1286                         if (skb)
1287                                 fraggap = skb->len - maxfraglen;
1288                         else
1289                                 fraggap = 0;
1290                         /* update mtu and maxfraglen if necessary */
1291                         if (skb == NULL || skb_prev == NULL)
1292                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1293                                                     fragheaderlen, skb, rt);
1294
1295                         skb_prev = skb;
1296
1297                         /*
1298                          * If remaining data exceeds the mtu,
1299                          * we know we need more fragment(s).
1300                          */
1301                         datalen = length + fraggap;
1302
1303                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1304                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1305                         if ((flags & MSG_MORE) &&
1306                             !(rt->dst.dev->features&NETIF_F_SG))
1307                                 alloclen = mtu;
1308                         else
1309                                 alloclen = datalen + fragheaderlen;
1310
1311                         alloclen += dst_exthdrlen;
1312
1313                         if (datalen != length + fraggap) {
1314                                 /*
1315                                  * this is not the last fragment, the trailer
1316                                  * space is regarded as data space.
1317                                  */
1318                                 datalen += rt->dst.trailer_len;
1319                         }
1320
1321                         alloclen += rt->dst.trailer_len;
1322                         fraglen = datalen + fragheaderlen;
1323
1324                         /*
1325                          * We just reserve space for fragment header.
1326                          * Note: this may be overallocation if the message
1327                          * (without MSG_MORE) fits into the MTU.
1328                          */
1329                         alloclen += sizeof(struct frag_hdr);
1330
1331                         if (transhdrlen) {
1332                                 skb = sock_alloc_send_skb(sk,
1333                                                 alloclen + hh_len,
1334                                                 (flags & MSG_DONTWAIT), &err);
1335                         } else {
1336                                 skb = NULL;
1337                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1338                                     2 * sk->sk_sndbuf)
1339                                         skb = sock_wmalloc(sk,
1340                                                            alloclen + hh_len, 1,
1341                                                            sk->sk_allocation);
1342                                 if (unlikely(skb == NULL))
1343                                         err = -ENOBUFS;
1344                                 else {
1345                                         /* Only the initial fragment
1346                                          * is time stamped.
1347                                          */
1348                                         tx_flags = 0;
1349                                 }
1350                         }
1351                         if (skb == NULL)
1352                                 goto error;
1353                         /*
1354                          *      Fill in the control structures
1355                          */
1356                         skb->ip_summed = CHECKSUM_NONE;
1357                         skb->csum = 0;
1358                         /* reserve for fragmentation and ipsec header */
1359                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1360                                     dst_exthdrlen);
1361
1362                         if (sk->sk_type == SOCK_DGRAM)
1363                                 skb_shinfo(skb)->tx_flags = tx_flags;
1364
1365                         /*
1366                          *      Find where to start putting bytes
1367                          */
1368                         data = skb_put(skb, fraglen);
1369                         skb_set_network_header(skb, exthdrlen);
1370                         data += fragheaderlen;
1371                         skb->transport_header = (skb->network_header +
1372                                                  fragheaderlen);
1373                         if (fraggap) {
1374                                 skb->csum = skb_copy_and_csum_bits(
1375                                         skb_prev, maxfraglen,
1376                                         data + transhdrlen, fraggap, 0);
1377                                 skb_prev->csum = csum_sub(skb_prev->csum,
1378                                                           skb->csum);
1379                                 data += fraggap;
1380                                 pskb_trim_unique(skb_prev, maxfraglen);
1381                         }
1382                         copy = datalen - transhdrlen - fraggap;
1383
1384                         if (copy < 0) {
1385                                 err = -EINVAL;
1386                                 kfree_skb(skb);
1387                                 goto error;
1388                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1389                                 err = -EFAULT;
1390                                 kfree_skb(skb);
1391                                 goto error;
1392                         }
1393
1394                         offset += copy;
1395                         length -= datalen - fraggap;
1396                         transhdrlen = 0;
1397                         exthdrlen = 0;
1398                         dst_exthdrlen = 0;
1399
1400                         /*
1401                          * Put the packet on the pending queue
1402                          */
1403                         __skb_queue_tail(&sk->sk_write_queue, skb);
1404                         continue;
1405                 }
1406
1407                 if (copy > length)
1408                         copy = length;
1409
1410                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1411                         unsigned int off;
1412
1413                         off = skb->len;
1414                         if (getfrag(from, skb_put(skb, copy),
1415                                                 offset, copy, off, skb) < 0) {
1416                                 __skb_trim(skb, off);
1417                                 err = -EFAULT;
1418                                 goto error;
1419                         }
1420                 } else {
1421                         int i = skb_shinfo(skb)->nr_frags;
1422                         struct page_frag *pfrag = sk_page_frag(sk);
1423
1424                         err = -ENOMEM;
1425                         if (!sk_page_frag_refill(sk, pfrag))
1426                                 goto error;
1427
1428                         if (!skb_can_coalesce(skb, i, pfrag->page,
1429                                               pfrag->offset)) {
1430                                 err = -EMSGSIZE;
1431                                 if (i == MAX_SKB_FRAGS)
1432                                         goto error;
1433
1434                                 __skb_fill_page_desc(skb, i, pfrag->page,
1435                                                      pfrag->offset, 0);
1436                                 skb_shinfo(skb)->nr_frags = ++i;
1437                                 get_page(pfrag->page);
1438                         }
1439                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1440                         if (getfrag(from,
1441                                     page_address(pfrag->page) + pfrag->offset,
1442                                     offset, copy, skb->len, skb) < 0)
1443                                 goto error_efault;
1444
1445                         pfrag->offset += copy;
1446                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1447                         skb->len += copy;
1448                         skb->data_len += copy;
1449                         skb->truesize += copy;
1450                         atomic_add(copy, &sk->sk_wmem_alloc);
1451                 }
1452                 offset += copy;
1453                 length -= copy;
1454         }
1455
1456         return 0;
1457
1458 error_efault:
1459         err = -EFAULT;
1460 error:
1461         cork->length -= length;
1462         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1463         return err;
1464 }
1465 EXPORT_SYMBOL_GPL(ip6_append_data);
1466
1467 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1468 {
1469         if (np->cork.opt) {
1470                 kfree(np->cork.opt->dst0opt);
1471                 kfree(np->cork.opt->dst1opt);
1472                 kfree(np->cork.opt->hopopt);
1473                 kfree(np->cork.opt->srcrt);
1474                 kfree(np->cork.opt);
1475                 np->cork.opt = NULL;
1476         }
1477
1478         if (inet->cork.base.dst) {
1479                 dst_release(inet->cork.base.dst);
1480                 inet->cork.base.dst = NULL;
1481                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1482         }
1483         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1484 }
1485
1486 int ip6_push_pending_frames(struct sock *sk)
1487 {
1488         struct sk_buff *skb, *tmp_skb;
1489         struct sk_buff **tail_skb;
1490         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1491         struct inet_sock *inet = inet_sk(sk);
1492         struct ipv6_pinfo *np = inet6_sk(sk);
1493         struct net *net = sock_net(sk);
1494         struct ipv6hdr *hdr;
1495         struct ipv6_txoptions *opt = np->cork.opt;
1496         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1497         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1498         unsigned char proto = fl6->flowi6_proto;
1499         int err = 0;
1500
1501         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1502                 goto out;
1503         tail_skb = &(skb_shinfo(skb)->frag_list);
1504
1505         /* move skb->data to ip header from ext header */
1506         if (skb->data < skb_network_header(skb))
1507                 __skb_pull(skb, skb_network_offset(skb));
1508         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1509                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1510                 *tail_skb = tmp_skb;
1511                 tail_skb = &(tmp_skb->next);
1512                 skb->len += tmp_skb->len;
1513                 skb->data_len += tmp_skb->len;
1514                 skb->truesize += tmp_skb->truesize;
1515                 tmp_skb->destructor = NULL;
1516                 tmp_skb->sk = NULL;
1517         }
1518
1519         /* Allow local fragmentation. */
1520         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1521                 skb->local_df = 1;
1522
1523         *final_dst = fl6->daddr;
1524         __skb_pull(skb, skb_network_header_len(skb));
1525         if (opt && opt->opt_flen)
1526                 ipv6_push_frag_opts(skb, opt, &proto);
1527         if (opt && opt->opt_nflen)
1528                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1529
1530         skb_push(skb, sizeof(struct ipv6hdr));
1531         skb_reset_network_header(skb);
1532         hdr = ipv6_hdr(skb);
1533
1534         ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1535         hdr->hop_limit = np->cork.hop_limit;
1536         hdr->nexthdr = proto;
1537         hdr->saddr = fl6->saddr;
1538         hdr->daddr = *final_dst;
1539
1540         skb->priority = sk->sk_priority;
1541         skb->mark = sk->sk_mark;
1542
1543         skb_dst_set(skb, dst_clone(&rt->dst));
1544         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1545         if (proto == IPPROTO_ICMPV6) {
1546                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1547
1548                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1549                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1550         }
1551
1552         err = ip6_local_out(skb);
1553         if (err) {
1554                 if (err > 0)
1555                         err = net_xmit_errno(err);
1556                 if (err)
1557                         goto error;
1558         }
1559
1560 out:
1561         ip6_cork_release(inet, np);
1562         return err;
1563 error:
1564         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1565         goto out;
1566 }
1567 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1568
1569 void ip6_flush_pending_frames(struct sock *sk)
1570 {
1571         struct sk_buff *skb;
1572
1573         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1574                 if (skb_dst(skb))
1575                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1576                                       IPSTATS_MIB_OUTDISCARDS);
1577                 kfree_skb(skb);
1578         }
1579
1580         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1581 }
1582 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);