net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int __ip6_local_out(struct sk_buff *skb)
  60 {
  61         int len;
  62
  63         len = skb->len - sizeof(struct ipv6hdr);
  64         if (len > IPV6_MAXPLEN)
  65                 len = 0;
  66         ipv6_hdr(skb)->payload_len = htons(len);
  67
  68         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  69                        skb_dst(skb)->dev, dst_output);
  70 }
  71
  /*
   * Send a locally generated packet: run __ip6_local_out() and, when it
   * returns 1, continue with dst_output().
   *
   * Any other value from __ip6_local_out() is returned unchanged; otherwise
   * the result of dst_output() is returned.
   */
  int ip6_local_out(struct sk_buff *skb)
  {
          int rc = __ip6_local_out(skb);

          if (unlikely(rc != 1))
                  return rc;

          return dst_output(skb);
  }
  EXPORT_SYMBOL_GPL(ip6_local_out);
  83
  84 static int ip6_finish_output2(struct sk_buff *skb)
  85 {
  86         struct dst_entry *dst = skb_dst(skb);
  87         struct net_device *dev = dst->dev;
  88         struct neighbour *neigh;
  89         struct in6_addr *nexthop;
  90         int ret;
  91
  92         skb->protocol = htons(ETH_P_IPV6);
  93         skb->dev = dev;
  94
  95         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  96                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  97
  98                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
  99                     ((mroute6_socket(dev_net(dev), skb) &&
 100                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 101                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 102                                          &ipv6_hdr(skb)->saddr))) {
 103                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 104
 105                         /* Do not check for IFF_ALLMULTI; multicast routing
 106                            is not supported in any case.
 107                          */
 108                         if (newskb)
 109                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 110                                         newskb, NULL, newskb->dev,
 111                                         dev_loopback_xmit);
 112
 113                         if (ipv6_hdr(skb)->hop_limit == 0) {
 114                                 IP6_INC_STATS(dev_net(dev), idev,
 115                                               IPSTATS_MIB_OUTDISCARDS);
 116                                 kfree_skb(skb);
 117                                 return 0;
 118                         }
 119                 }
 120
 121                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 122                                 skb->len);
 123
 124                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 125                     IPV6_ADDR_SCOPE_NODELOCAL &&
 126                     !(dev->flags & IFF_LOOPBACK)) {
 127                         kfree_skb(skb);
 128                         return 0;
 129                 }
 130         }
 131
 132         rcu_read_lock_bh();
 133         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 134         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 135         if (unlikely(!neigh))
 136                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 137         if (!IS_ERR(neigh)) {
 138                 ret = dst_neigh_output(dst, neigh, skb);
 139                 rcu_read_unlock_bh();
 140                 return ret;
 141         }
 142         rcu_read_unlock_bh();
 143
 144         IP6_INC_STATS_BH(dev_net(dst->dev),
 145                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 146         kfree_skb(skb);
 147         return -EINVAL;
 148 }
 149
 150 static int ip6_finish_output(struct sk_buff *skb)
 151 {
 152         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 153             dst_allfrag(skb_dst(skb)))
 154                 return ip6_fragment(skb, ip6_finish_output2);
 155         else
 156                 return ip6_finish_output2(skb);
 157 }
 158
 159 int ip6_output(struct sk_buff *skb)
 160 {
 161         struct net_device *dev = skb_dst(skb)->dev;
 162         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 163         if (unlikely(idev->cnf.disable_ipv6)) {
 164                 IP6_INC_STATS(dev_net(dev), idev,
 165                               IPSTATS_MIB_OUTDISCARDS);
 166                 kfree_skb(skb);
 167                 return 0;
 168         }
 169
 170         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 171                             ip6_finish_output,
 172                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 173 }
 174
 175 /*
 176  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 177  */
 178
 179 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 180              struct ipv6_txoptions *opt, int tclass)
 181 {
 182         struct net *net = sock_net(sk);
 183         struct ipv6_pinfo *np = inet6_sk(sk);
 184         struct in6_addr *first_hop = &fl6->daddr;
 185         struct dst_entry *dst = skb_dst(skb);
 186         struct ipv6hdr *hdr;
 187         u8  proto = fl6->flowi6_proto;
 188         int seg_len = skb->len;
 189         int hlimit = -1;
 190         u32 mtu;
 191
 192         if (opt) {
 193                 unsigned int head_room;
 194
 195                 /* First: exthdrs may take lots of space (~8K for now)
 196                    MAX_HEADER is not enough.
 197                  */
 198                 head_room = opt->opt_nflen + opt->opt_flen;
 199                 seg_len += head_room;
 200                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 201
 202                 if (skb_headroom(skb) < head_room) {
 203                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 204                         if (skb2 == NULL) {
 205                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 206                                               IPSTATS_MIB_OUTDISCARDS);
 207                                 kfree_skb(skb);
 208                                 return -ENOBUFS;
 209                         }
 210                         consume_skb(skb);
 211                         skb = skb2;
 212                         skb_set_owner_w(skb, sk);
 213                 }
 214                 if (opt->opt_flen)
 215                         ipv6_push_frag_opts(skb, opt, &proto);
 216                 if (opt->opt_nflen)
 217                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 218         }
 219
 220         skb_push(skb, sizeof(struct ipv6hdr));
 221         skb_reset_network_header(skb);
 222         hdr = ipv6_hdr(skb);
 223
 224         /*
 225          *      Fill in the IPv6 header
 226          */
 227         if (np)
 228                 hlimit = np->hop_limit;
 229         if (hlimit < 0)
 230                 hlimit = ip6_dst_hoplimit(dst);
 231
 232         ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
 233
 234         hdr->payload_len = htons(seg_len);
 235         hdr->nexthdr = proto;
 236         hdr->hop_limit = hlimit;
 237
 238         hdr->saddr = fl6->saddr;
 239         hdr->daddr = *first_hop;
 240
 241         skb->priority = sk->sk_priority;
 242         skb->mark = sk->sk_mark;
 243
 244         mtu = dst_mtu(dst);
 245         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 246                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 247                               IPSTATS_MIB_OUT, skb->len);
 248                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 249                                dst->dev, dst_output);
 250         }
 251
 252         skb->dev = dst->dev;
 253         ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
 254         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 255         kfree_skb(skb);
 256         return -EMSGSIZE;
 257 }
 258
 259 EXPORT_SYMBOL(ip6_xmit);
 260
 261 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 262 {
 263         struct ip6_ra_chain *ra;
 264         struct sock *last = NULL;
 265
 266         read_lock(&ip6_ra_lock);
 267         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 268                 struct sock *sk = ra->sk;
 269                 if (sk && ra->sel == sel &&
 270                     (!sk->sk_bound_dev_if ||
 271                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 272                         if (last) {
 273                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 274                                 if (skb2)
 275                                         rawv6_rcv(last, skb2);
 276                         }
 277                         last = sk;
 278                 }
 279         }
 280
 281         if (last) {
 282                 rawv6_rcv(last, skb);
 283                 read_unlock(&ip6_ra_lock);
 284                 return 1;
 285         }
 286         read_unlock(&ip6_ra_lock);
 287         return 0;
 288 }
 289
 290 static int ip6_forward_proxy_check(struct sk_buff *skb)
 291 {
 292         struct ipv6hdr *hdr = ipv6_hdr(skb);
 293         u8 nexthdr = hdr->nexthdr;
 294         __be16 frag_off;
 295         int offset;
 296
 297         if (ipv6_ext_hdr(nexthdr)) {
 298                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 299                 if (offset < 0)
 300                         return 0;
 301         } else
 302                 offset = sizeof(struct ipv6hdr);
 303
 304         if (nexthdr == IPPROTO_ICMPV6) {
 305                 struct icmp6hdr *icmp6;
 306
 307                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 308                                          offset + 1 - skb->data)))
 309                         return 0;
 310
 311                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 312
 313                 switch (icmp6->icmp6_type) {
 314                 case NDISC_ROUTER_SOLICITATION:
 315                 case NDISC_ROUTER_ADVERTISEMENT:
 316                 case NDISC_NEIGHBOUR_SOLICITATION:
 317                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 318                 case NDISC_REDIRECT:
 319                         /* For reaction involving unicast neighbor discovery
 320                          * message destined to the proxied address, pass it to
 321                          * input function.
 322                          */
 323                         return 1;
 324                 default:
 325                         break;
 326                 }
 327         }
 328
 329         /*
 330          * The proxying router can't forward traffic sent to a link-local
 331          * address, so signal the sender and discard the packet. This
 332          * behavior is clarified by the MIPv6 specification.
 333          */
 334         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 335                 dst_link_failure(skb);
 336                 return -1;
 337         }
 338
 339         return 0;
 340 }
 341
 /*
  * Continuation of the NF_INET_FORWARD hook: pass the packet to
  * dst_output() and return its result.
  */
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
         int rc = dst_output(skb);

         return rc;
 }
 346
 347 int ip6_forward(struct sk_buff *skb)
 348 {
 349         struct dst_entry *dst = skb_dst(skb);
 350         struct ipv6hdr *hdr = ipv6_hdr(skb);
 351         struct inet6_skb_parm *opt = IP6CB(skb);
 352         struct net *net = dev_net(dst->dev);
 353         u32 mtu;
 354
 355         if (net->ipv6.devconf_all->forwarding == 0)
 356                 goto error;
 357
 358         if (skb_warn_if_lro(skb))
 359                 goto drop;
 360
 361         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 362                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 363                 goto drop;
 364         }
 365
 366         if (skb->pkt_type != PACKET_HOST)
 367                 goto drop;
 368
 369         skb_forward_csum(skb);
 370
 371         /*
 372          *      We DO NOT make any processing on
 373          *      RA packets, pushing them to user level AS IS
 374          *      without ane WARRANTY that application will be able
 375          *      to interpret them. The reason is that we
 376          *      cannot make anything clever here.
 377          *
 378          *      We are not end-node, so that if packet contains
 379          *      AH/ESP, we cannot make anything.
 380          *      Defragmentation also would be mistake, RA packets
 381          *      cannot be fragmented, because there is no warranty
 382          *      that different fragments will go along one path. --ANK
 383          */
 384         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 385                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 386                         return 0;
 387         }
 388
 389         /*
 390          *      check and decrement ttl
 391          */
 392         if (hdr->hop_limit <= 1) {
 393                 /* Force OUTPUT device used as source address */
 394                 skb->dev = dst->dev;
 395                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 396                 IP6_INC_STATS_BH(net,
 397                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 398
 399                 kfree_skb(skb);
 400                 return -ETIMEDOUT;
 401         }
 402
 403         /* XXX: idev->cnf.proxy_ndp? */
 404         if (net->ipv6.devconf_all->proxy_ndp &&
 405             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 406                 int proxied = ip6_forward_proxy_check(skb);
 407                 if (proxied > 0)
 408                         return ip6_input(skb);
 409                 else if (proxied < 0) {
 410                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 411                                       IPSTATS_MIB_INDISCARDS);
 412                         goto drop;
 413                 }
 414         }
 415
 416         if (!xfrm6_route_forward(skb)) {
 417                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 418                 goto drop;
 419         }
 420         dst = skb_dst(skb);
 421
 422         /* IPv6 specs say nothing about it, but it is clear that we cannot
 423            send redirects to source routed frames.
 424            We don't send redirects to frames decapsulated from IPsec.
 425          */
 426         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 427                 struct in6_addr *target = NULL;
 428                 struct inet_peer *peer;
 429                 struct rt6_info *rt;
 430
 431                 /*
 432                  *      incoming and outgoing devices are the same
 433                  *      send a redirect.
 434                  */
 435
 436                 rt = (struct rt6_info *) dst;
 437                 if (rt->rt6i_flags & RTF_GATEWAY)
 438                         target = &rt->rt6i_gateway;
 439                 else
 440                         target = &hdr->daddr;
 441
 442                 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 443
 444                 /* Limit redirects both by destination (here)
 445                    and by source (inside ndisc_send_redirect)
 446                  */
 447                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 448                         ndisc_send_redirect(skb, target);
 449                 if (peer)
 450                         inet_putpeer(peer);
 451         } else {
 452                 int addrtype = ipv6_addr_type(&hdr->saddr);
 453
 454                 /* This check is security critical. */
 455                 if (addrtype == IPV6_ADDR_ANY ||
 456                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 457                         goto error;
 458                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 459                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 460                                     ICMPV6_NOT_NEIGHBOUR, 0);
 461                         goto error;
 462                 }
 463         }
 464
 465         mtu = dst_mtu(dst);
 466         if (mtu < IPV6_MIN_MTU)
 467                 mtu = IPV6_MIN_MTU;
 468
 469         if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
 470             (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
 471                 /* Again, force OUTPUT device used as source address */
 472                 skb->dev = dst->dev;
 473                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 474                 IP6_INC_STATS_BH(net,
 475                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 476                 IP6_INC_STATS_BH(net,
 477                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 478                 kfree_skb(skb);
 479                 return -EMSGSIZE;
 480         }
 481
 482         if (skb_cow(skb, dst->dev->hard_header_len)) {
 483                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 484                 goto drop;
 485         }
 486
 487         hdr = ipv6_hdr(skb);
 488
 489         /* Mangling hops number delayed to point after skb COW */
 490
 491         hdr->hop_limit--;
 492
 493         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 494         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 495         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 496                        ip6_forward_finish);
 497
 498 error:
 499         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 500 drop:
 501         kfree_skb(skb);
 502         return -EINVAL;
 503 }
 504
 505 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 506 {
 507         to->pkt_type = from->pkt_type;
 508         to->priority = from->priority;
 509         to->protocol = from->protocol;
 510         skb_dst_drop(to);
 511         skb_dst_set(to, dst_clone(skb_dst(from)));
 512         to->dev = from->dev;
 513         to->mark = from->mark;
 514
 515 #ifdef CONFIG_NET_SCHED
 516         to->tc_index = from->tc_index;
 517 #endif
 518         nf_copy(to, from);
 519 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 520         to->nf_trace = from->nf_trace;
 521 #endif
 522         skb_copy_secmark(to, from);
 523 }
 524
 525 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 526 {
 527         struct sk_buff *frag;
 528         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 529         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 530         struct ipv6hdr *tmp_hdr;
 531         struct frag_hdr *fh;
 532         unsigned int mtu, hlen, left, len;
 533         int hroom, troom;
 534         __be32 frag_id = 0;
 535         int ptr, offset = 0, err=0;
 536         u8 *prevhdr, nexthdr = 0;
 537         struct net *net = dev_net(skb_dst(skb)->dev);
 538
 539         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 540         nexthdr = *prevhdr;
 541
 542         mtu = ip6_skb_dst_mtu(skb);
 543
 544         /* We must not fragment if the socket is set to force MTU discovery
 545          * or if the skb it not generated by a local socket.
 546          */
 547         if (unlikely(!skb->local_df && skb->len > mtu) ||
 548                      (IP6CB(skb)->frag_max_size &&
 549                       IP6CB(skb)->frag_max_size > mtu)) {
 550                 if (skb->sk && dst_allfrag(skb_dst(skb)))
 551                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 552
 553                 skb->dev = skb_dst(skb)->dev;
 554                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 555                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 556                               IPSTATS_MIB_FRAGFAILS);
 557                 kfree_skb(skb);
 558                 return -EMSGSIZE;
 559         }
 560
 561         if (np && np->frag_size < mtu) {
 562                 if (np->frag_size)
 563                         mtu = np->frag_size;
 564         }
 565         mtu -= hlen + sizeof(struct frag_hdr);
 566
 567         if (skb_has_frag_list(skb)) {
 568                 int first_len = skb_pagelen(skb);
 569                 struct sk_buff *frag2;
 570
 571                 if (first_len - hlen > mtu ||
 572                     ((first_len - hlen) & 7) ||
 573                     skb_cloned(skb))
 574                         goto slow_path;
 575
 576                 skb_walk_frags(skb, frag) {
 577                         /* Correct geometry. */
 578                         if (frag->len > mtu ||
 579                             ((frag->len & 7) && frag->next) ||
 580                             skb_headroom(frag) < hlen)
 581                                 goto slow_path_clean;
 582
 583                         /* Partially cloned skb? */
 584                         if (skb_shared(frag))
 585                                 goto slow_path_clean;
 586
 587                         BUG_ON(frag->sk);
 588                         if (skb->sk) {
 589                                 frag->sk = skb->sk;
 590                                 frag->destructor = sock_wfree;
 591                         }
 592                         skb->truesize -= frag->truesize;
 593                 }
 594
 595                 err = 0;
 596                 offset = 0;
 597                 frag = skb_shinfo(skb)->frag_list;
 598                 skb_frag_list_init(skb);
 599                 /* BUILD HEADER */
 600
 601                 *prevhdr = NEXTHDR_FRAGMENT;
 602                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 603                 if (!tmp_hdr) {
 604                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 605                                       IPSTATS_MIB_FRAGFAILS);
 606                         return -ENOMEM;
 607                 }
 608
 609                 __skb_pull(skb, hlen);
 610                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 611                 __skb_push(skb, hlen);
 612                 skb_reset_network_header(skb);
 613                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 614
 615                 ipv6_select_ident(fh, rt);
 616                 fh->nexthdr = nexthdr;
 617                 fh->reserved = 0;
 618                 fh->frag_off = htons(IP6_MF);
 619                 frag_id = fh->identification;
 620
 621                 first_len = skb_pagelen(skb);
 622                 skb->data_len = first_len - skb_headlen(skb);
 623                 skb->len = first_len;
 624                 ipv6_hdr(skb)->payload_len = htons(first_len -
 625                                                    sizeof(struct ipv6hdr));
 626
 627                 dst_hold(&rt->dst);
 628
 629                 for (;;) {
 630                         /* Prepare header of the next frame,
 631                          * before previous one went down. */
 632                         if (frag) {
 633                                 frag->ip_summed = CHECKSUM_NONE;
 634                                 skb_reset_transport_header(frag);
 635                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 636                                 __skb_push(frag, hlen);
 637                                 skb_reset_network_header(frag);
 638                                 memcpy(skb_network_header(frag), tmp_hdr,
 639                                        hlen);
 640                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 641                                 fh->nexthdr = nexthdr;
 642                                 fh->reserved = 0;
 643                                 fh->frag_off = htons(offset);
 644                                 if (frag->next != NULL)
 645                                         fh->frag_off |= htons(IP6_MF);
 646                                 fh->identification = frag_id;
 647                                 ipv6_hdr(frag)->payload_len =
 648                                                 htons(frag->len -
 649                                                       sizeof(struct ipv6hdr));
 650                                 ip6_copy_metadata(frag, skb);
 651                         }
 652
 653                         err = output(skb);
 654                         if(!err)
 655                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 656                                               IPSTATS_MIB_FRAGCREATES);
 657
 658                         if (err || !frag)
 659                                 break;
 660
 661                         skb = frag;
 662                         frag = skb->next;
 663                         skb->next = NULL;
 664                 }
 665
 666                 kfree(tmp_hdr);
 667
 668                 if (err == 0) {
 669                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 670                                       IPSTATS_MIB_FRAGOKS);
 671                         ip6_rt_put(rt);
 672                         return 0;
 673                 }
 674
 675                 while (frag) {
 676                         skb = frag->next;
 677                         kfree_skb(frag);
 678                         frag = skb;
 679                 }
 680
 681                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 682                               IPSTATS_MIB_FRAGFAILS);
 683                 ip6_rt_put(rt);
 684                 return err;
 685
 686 slow_path_clean:
 687                 skb_walk_frags(skb, frag2) {
 688                         if (frag2 == frag)
 689                                 break;
 690                         frag2->sk = NULL;
 691                         frag2->destructor = NULL;
 692                         skb->truesize += frag2->truesize;
 693                 }
 694         }
 695
 696 slow_path:
 697         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 698             skb_checksum_help(skb))
 699                 goto fail;
 700
 701         left = skb->len - hlen;         /* Space per frame */
 702         ptr = hlen;                     /* Where to start from */
 703
 704         /*
 705          *      Fragment the datagram.
 706          */
 707
 708         *prevhdr = NEXTHDR_FRAGMENT;
 709         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 710         troom = rt->dst.dev->needed_tailroom;
 711
 712         /*
 713          *      Keep copying data until we run out.
 714          */
 715         while(left > 0) {
 716                 len = left;
 717                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 718                 if (len > mtu)
 719                         len = mtu;
 720                 /* IF: we are not sending up to and including the packet end
 721                    then align the next start on an eight byte boundary */
 722                 if (len < left) {
 723                         len &= ~7;
 724                 }
 725                 /*
 726                  *      Allocate buffer.
 727                  */
 728
 729                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 730                                       hroom + troom, GFP_ATOMIC)) == NULL) {
 731                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 732                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 733                                       IPSTATS_MIB_FRAGFAILS);
 734                         err = -ENOMEM;
 735                         goto fail;
 736                 }
 737
 738                 /*
 739                  *      Set up data on packet
 740                  */
 741
 742                 ip6_copy_metadata(frag, skb);
 743                 skb_reserve(frag, hroom);
 744                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 745                 skb_reset_network_header(frag);
 746                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 747                 frag->transport_header = (frag->network_header + hlen +
 748                                           sizeof(struct frag_hdr));
 749
 750                 /*
 751                  *      Charge the memory for the fragment to any owner
 752                  *      it might possess
 753                  */
 754                 if (skb->sk)
 755                         skb_set_owner_w(frag, skb->sk);
 756
 757                 /*
 758                  *      Copy the packet header into the new buffer.
 759                  */
 760                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 761
 762                 /*
 763                  *      Build fragment header.
 764                  */
 765                 fh->nexthdr = nexthdr;
 766                 fh->reserved = 0;
 767                 if (!frag_id) {
 768                         ipv6_select_ident(fh, rt);
 769                         frag_id = fh->identification;
 770                 } else
 771                         fh->identification = frag_id;
 772
 773                 /*
 774                  *      Copy a block of the IP datagram.
 775                  */
 776                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 777                         BUG();
 778                 left -= len;
 779
 780                 fh->frag_off = htons(offset);
 781                 if (left > 0)
 782                         fh->frag_off |= htons(IP6_MF);
 783                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 784                                                     sizeof(struct ipv6hdr));
 785
 786                 ptr += len;
 787                 offset += len;
 788
 789                 /*
 790                  *      Put this fragment into the sending queue.
 791                  */
 792                 err = output(frag);
 793                 if (err)
 794                         goto fail;
 795
 796                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 797                               IPSTATS_MIB_FRAGCREATES);
 798         }
 799         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 800                       IPSTATS_MIB_FRAGOKS);
 801         consume_skb(skb);
 802         return err;
 803
 804 fail:
 805         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 806                       IPSTATS_MIB_FRAGFAILS);
 807         kfree_skb(skb);
 808         return err;
 809 }
 810
 811 static inline int ip6_rt_check(const struct rt6key *rt_key,
 812                                const struct in6_addr *fl_addr,
 813                                const struct in6_addr *addr_cache)
 814 {
 815         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 816                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 817 }
 818
 819 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 820                                           struct dst_entry *dst,
 821                                           const struct flowi6 *fl6)
 822 {
 823         struct ipv6_pinfo *np = inet6_sk(sk);
 824         struct rt6_info *rt;
 825
 826         if (!dst)
 827                 goto out;
 828
 829         if (dst->ops->family != AF_INET6) {
 830                 dst_release(dst);
 831                 return NULL;
 832         }
 833
 834         rt = (struct rt6_info *)dst;
 835         /* Yes, checking route validity in not connected
 836          * case is not very simple. Take into account,
 837          * that we do not support routing by source, TOS,
 838          * and MSG_DONTROUTE            --ANK (980726)
 839          *
 840          * 1. ip6_rt_check(): If route was host route,
 841          *    check that cached destination is current.
 842          *    If it is network route, we still may
 843          *    check its validity using saved pointer
 844          *    to the last used address: daddr_cache.
 845          *    We do not want to save whole address now,
 846          *    (because main consumer of this service
 847          *    is tcp, which has not this problem),
 848          *    so that the last trick works only on connected
 849          *    sockets.
 850          * 2. oif also should be the same.
 851          */
 852         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 853 #ifdef CONFIG_IPV6_SUBTREES
 854             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 855 #endif
 856             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 857                 dst_release(dst);
 858                 dst = NULL;
 859         }
 860
 861 out:
 862         return dst;
 863 }
 864
 865 static int ip6_dst_lookup_tail(struct sock *sk,
 866                                struct dst_entry **dst, struct flowi6 *fl6)
 867 {
 868         struct net *net = sock_net(sk);
 869 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 870         struct neighbour *n;
 871         struct rt6_info *rt;
 872 #endif
 873         int err;
 874
 875         if (*dst == NULL)
 876                 *dst = ip6_route_output(net, sk, fl6);
 877
 878         if ((err = (*dst)->error))
 879                 goto out_err_release;
 880
 881         if (ipv6_addr_any(&fl6->saddr)) {
 882                 struct rt6_info *rt = (struct rt6_info *) *dst;
 883                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 884                                           sk ? inet6_sk(sk)->srcprefs : 0,
 885                                           &fl6->saddr);
 886                 if (err)
 887                         goto out_err_release;
 888         }
 889
 890 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 891         /*
 892          * Here if the dst entry we've looked up
 893          * has a neighbour entry that is in the INCOMPLETE
 894          * state and the src address from the flow is
 895          * marked as OPTIMISTIC, we release the found
 896          * dst entry and replace it instead with the
 897          * dst entry of the nexthop router
 898          */
 899         rt = (struct rt6_info *) *dst;
 900         rcu_read_lock_bh();
 901         n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
 902         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 903         rcu_read_unlock_bh();
 904
 905         if (err) {
 906                 struct inet6_ifaddr *ifp;
 907                 struct flowi6 fl_gw6;
 908                 int redirect;
 909
 910                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 911                                       (*dst)->dev, 1);
 912
 913                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 914                 if (ifp)
 915                         in6_ifa_put(ifp);
 916
 917                 if (redirect) {
 918                         /*
 919                          * We need to get the dst entry for the
 920                          * default router instead
 921                          */
 922                         dst_release(*dst);
 923                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 924                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 925                         *dst = ip6_route_output(net, sk, &fl_gw6);
 926                         if ((err = (*dst)->error))
 927                                 goto out_err_release;
 928                 }
 929         }
 930 #endif
 931
 932         return 0;
 933
 934 out_err_release:
 935         if (err == -ENETUNREACH)
 936                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
 937         dst_release(*dst);
 938         *dst = NULL;
 939         return err;
 940 }
 941
 942 /**
 943  *      ip6_dst_lookup - perform route lookup on flow
 944  *      @sk: socket which provides route info
 945  *      @dst: pointer to dst_entry * for result
 946  *      @fl6: flow to lookup
 947  *
 948  *      This function performs a route lookup on the given flow.
 949  *
 950  *      It returns zero on success, or a standard errno code on error.
 951  */
 952 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
 953 {
 954         *dst = NULL;
 955         return ip6_dst_lookup_tail(sk, dst, fl6);
 956 }
 957 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 958
 959 /**
 960  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 961  *      @sk: socket which provides route info
 962  *      @fl6: flow to lookup
 963  *      @final_dst: final destination address for ipsec lookup
 964  *      @can_sleep: we are in a sleepable context
 965  *
 966  *      This function performs a route lookup on the given flow.
 967  *
 968  *      It returns a valid dst pointer on success, or a pointer encoded
 969  *      error code.
 970  */
 971 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 972                                       const struct in6_addr *final_dst,
 973                                       bool can_sleep)
 974 {
 975         struct dst_entry *dst = NULL;
 976         int err;
 977
 978         err = ip6_dst_lookup_tail(sk, &dst, fl6);
 979         if (err)
 980                 return ERR_PTR(err);
 981         if (final_dst)
 982                 fl6->daddr = *final_dst;
 983         if (can_sleep)
 984                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 985
 986         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 987 }
 988 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
 989
 990 /**
 991  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 992  *      @sk: socket which provides the dst cache and route info
 993  *      @fl6: flow to lookup
 994  *      @final_dst: final destination address for ipsec lookup
 995  *      @can_sleep: we are in a sleepable context
 996  *
 997  *      This function performs a route lookup on the given flow with the
 998  *      possibility of using the cached route in the socket if it is valid.
 999  *      It will take the socket dst lock when operating on the dst cache.
1000  *      As a result, this function can only be used in process context.
1001  *
1002  *      It returns a valid dst pointer on success, or a pointer encoded
1003  *      error code.
1004  */
1005 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1006                                          const struct in6_addr *final_dst,
1007                                          bool can_sleep)
1008 {
1009         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1010         int err;
1011
1012         dst = ip6_sk_dst_check(sk, dst, fl6);
1013
1014         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1015         if (err)
1016                 return ERR_PTR(err);
1017         if (final_dst)
1018                 fl6->daddr = *final_dst;
1019         if (can_sleep)
1020                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1021
1022         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1023 }
1024 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1025
1026 static inline int ip6_ufo_append_data(struct sock *sk,
1027                         int getfrag(void *from, char *to, int offset, int len,
1028                         int odd, struct sk_buff *skb),
1029                         void *from, int length, int hh_len, int fragheaderlen,
1030                         int transhdrlen, int mtu,unsigned int flags,
1031                         struct rt6_info *rt)
1032
1033 {
1034         struct sk_buff *skb;
1035         int err;
1036
1037         /* There is support for UDP large send offload by network
1038          * device, so create one single skb packet containing complete
1039          * udp datagram
1040          */
1041         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1042                 skb = sock_alloc_send_skb(sk,
1043                         hh_len + fragheaderlen + transhdrlen + 20,
1044                         (flags & MSG_DONTWAIT), &err);
1045                 if (skb == NULL)
1046                         return err;
1047
1048                 /* reserve space for Hardware header */
1049                 skb_reserve(skb, hh_len);
1050
1051                 /* create space for UDP/IP header */
1052                 skb_put(skb,fragheaderlen + transhdrlen);
1053
1054                 /* initialize network header pointer */
1055                 skb_reset_network_header(skb);
1056
1057                 /* initialize protocol header pointer */
1058                 skb->transport_header = skb->network_header + fragheaderlen;
1059
1060                 skb->ip_summed = CHECKSUM_PARTIAL;
1061                 skb->csum = 0;
1062         }
1063
1064         err = skb_append_datato_frags(sk,skb, getfrag, from,
1065                                       (length - transhdrlen));
1066         if (!err) {
1067                 struct frag_hdr fhdr;
1068
1069                 /* Specify the length of each IPv6 datagram fragment.
1070                  * It has to be a multiple of 8.
1071                  */
1072                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1073                                              sizeof(struct frag_hdr)) & ~7;
1074                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1075                 ipv6_select_ident(&fhdr, rt);
1076                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1077                 __skb_queue_tail(&sk->sk_write_queue, skb);
1078
1079                 return 0;
1080         }
1081         /* There is not enough support do UPD LSO,
1082          * so follow normal path
1083          */
1084         kfree_skb(skb);
1085
1086         return err;
1087 }
1088
1089 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1090                                                gfp_t gfp)
1091 {
1092         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1093 }
1094
1095 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1096                                                 gfp_t gfp)
1097 {
1098         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1099 }
1100
1101 static void ip6_append_data_mtu(int *mtu,
1102                                 int *maxfraglen,
1103                                 unsigned int fragheaderlen,
1104                                 struct sk_buff *skb,
1105                                 struct rt6_info *rt)
1106 {
1107         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1108                 if (skb == NULL) {
1109                         /* first fragment, reserve header_len */
1110                         *mtu = *mtu - rt->dst.header_len;
1111
1112                 } else {
1113                         /*
1114                          * this fragment is not first, the headers
1115                          * space is regarded as data space.
1116                          */
1117                         *mtu = dst_mtu(rt->dst.path);
1118                 }
1119                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1120                               + fragheaderlen - sizeof(struct frag_hdr);
1121         }
1122 }
1123
1124 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1125         int offset, int len, int odd, struct sk_buff *skb),
1126         void *from, int length, int transhdrlen,
1127         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1128         struct rt6_info *rt, unsigned int flags, int dontfrag)
1129 {
1130         struct inet_sock *inet = inet_sk(sk);
1131         struct ipv6_pinfo *np = inet6_sk(sk);
1132         struct inet_cork *cork;
1133         struct sk_buff *skb, *skb_prev = NULL;
1134         unsigned int maxfraglen, fragheaderlen;
1135         int exthdrlen;
1136         int dst_exthdrlen;
1137         int hh_len;
1138         int mtu;
1139         int copy;
1140         int err;
1141         int offset = 0;
1142         __u8 tx_flags = 0;
1143
1144         if (flags&MSG_PROBE)
1145                 return 0;
1146         cork = &inet->cork.base;
1147         if (skb_queue_empty(&sk->sk_write_queue)) {
1148                 /*
1149                  * setup for corking
1150                  */
1151                 if (opt) {
1152                         if (WARN_ON(np->cork.opt))
1153                                 return -EINVAL;
1154
1155                         np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1156                         if (unlikely(np->cork.opt == NULL))
1157                                 return -ENOBUFS;
1158
1159                         np->cork.opt->tot_len = opt->tot_len;
1160                         np->cork.opt->opt_flen = opt->opt_flen;
1161                         np->cork.opt->opt_nflen = opt->opt_nflen;
1162
1163                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1164                                                             sk->sk_allocation);
1165                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1166                                 return -ENOBUFS;
1167
1168                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1169                                                             sk->sk_allocation);
1170                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1171                                 return -ENOBUFS;
1172
1173                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1174                                                            sk->sk_allocation);
1175                         if (opt->hopopt && !np->cork.opt->hopopt)
1176                                 return -ENOBUFS;
1177
1178                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1179                                                             sk->sk_allocation);
1180                         if (opt->srcrt && !np->cork.opt->srcrt)
1181                                 return -ENOBUFS;
1182
1183                         /* need source address above miyazawa*/
1184                 }
1185                 dst_hold(&rt->dst);
1186                 cork->dst = &rt->dst;
1187                 inet->cork.fl.u.ip6 = *fl6;
1188                 np->cork.hop_limit = hlimit;
1189                 np->cork.tclass = tclass;
1190                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1191                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1192                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1193                 else
1194                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1195                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1196                 if (np->frag_size < mtu) {
1197                         if (np->frag_size)
1198                                 mtu = np->frag_size;
1199                 }
1200                 cork->fragsize = mtu;
1201                 if (dst_allfrag(rt->dst.path))
1202                         cork->flags |= IPCORK_ALLFRAG;
1203                 cork->length = 0;
1204                 exthdrlen = (opt ? opt->opt_flen : 0);
1205                 length += exthdrlen;
1206                 transhdrlen += exthdrlen;
1207                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1208         } else {
1209                 rt = (struct rt6_info *)cork->dst;
1210                 fl6 = &inet->cork.fl.u.ip6;
1211                 opt = np->cork.opt;
1212                 transhdrlen = 0;
1213                 exthdrlen = 0;
1214                 dst_exthdrlen = 0;
1215                 mtu = cork->fragsize;
1216         }
1217
1218         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1219
1220         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1221                         (opt ? opt->opt_nflen : 0);
1222         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1223
1224         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1225                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1226                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1227                         return -EMSGSIZE;
1228                 }
1229         }
1230
1231         /* For UDP, check if TX timestamp is enabled */
1232         if (sk->sk_type == SOCK_DGRAM)
1233                 sock_tx_timestamp(sk, &tx_flags);
1234
1235         /*
1236          * Let's try using as much space as possible.
1237          * Use MTU if total length of the message fits into the MTU.
1238          * Otherwise, we need to reserve fragment header and
1239          * fragment alignment (= 8-15 octects, in total).
1240          *
1241          * Note that we may need to "move" the data from the tail of
1242          * of the buffer to the new fragment when we split
1243          * the message.
1244          *
1245          * FIXME: It may be fragmented into multiple chunks
1246          *        at once if non-fragmentable extension headers
1247          *        are too large.
1248          * --yoshfuji
1249          */
1250
1251         cork->length += length;
1252         if (length > mtu) {
1253                 int proto = sk->sk_protocol;
1254                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1255                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1256                         return -EMSGSIZE;
1257                 }
1258
1259                 if (proto == IPPROTO_UDP &&
1260                     (rt->dst.dev->features & NETIF_F_UFO)) {
1261
1262                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1263                                                   hh_len, fragheaderlen,
1264                                                   transhdrlen, mtu, flags, rt);
1265                         if (err)
1266                                 goto error;
1267                         return 0;
1268                 }
1269         }
1270
1271         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1272                 goto alloc_new_skb;
1273
1274         while (length > 0) {
1275                 /* Check if the remaining data fits into current packet. */
1276                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1277                 if (copy < length)
1278                         copy = maxfraglen - skb->len;
1279
1280                 if (copy <= 0) {
1281                         char *data;
1282                         unsigned int datalen;
1283                         unsigned int fraglen;
1284                         unsigned int fraggap;
1285                         unsigned int alloclen;
1286 alloc_new_skb:
1287                         /* There's no room in the current skb */
1288                         if (skb)
1289                                 fraggap = skb->len - maxfraglen;
1290                         else
1291                                 fraggap = 0;
1292                         /* update mtu and maxfraglen if necessary */
1293                         if (skb == NULL || skb_prev == NULL)
1294                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1295                                                     fragheaderlen, skb, rt);
1296
1297                         skb_prev = skb;
1298
1299                         /*
1300                          * If remaining data exceeds the mtu,
1301                          * we know we need more fragment(s).
1302                          */
1303                         datalen = length + fraggap;
1304
1305                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1306                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1307                         if ((flags & MSG_MORE) &&
1308                             !(rt->dst.dev->features&NETIF_F_SG))
1309                                 alloclen = mtu;
1310                         else
1311                                 alloclen = datalen + fragheaderlen;
1312
1313                         alloclen += dst_exthdrlen;
1314
1315                         if (datalen != length + fraggap) {
1316                                 /*
1317                                  * this is not the last fragment, the trailer
1318                                  * space is regarded as data space.
1319                                  */
1320                                 datalen += rt->dst.trailer_len;
1321                         }
1322
1323                         alloclen += rt->dst.trailer_len;
1324                         fraglen = datalen + fragheaderlen;
1325
1326                         /*
1327                          * We just reserve space for fragment header.
1328                          * Note: this may be overallocation if the message
1329                          * (without MSG_MORE) fits into the MTU.
1330                          */
1331                         alloclen += sizeof(struct frag_hdr);
1332
1333                         if (transhdrlen) {
1334                                 skb = sock_alloc_send_skb(sk,
1335                                                 alloclen + hh_len,
1336                                                 (flags & MSG_DONTWAIT), &err);
1337                         } else {
1338                                 skb = NULL;
1339                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1340                                     2 * sk->sk_sndbuf)
1341                                         skb = sock_wmalloc(sk,
1342                                                            alloclen + hh_len, 1,
1343                                                            sk->sk_allocation);
1344                                 if (unlikely(skb == NULL))
1345                                         err = -ENOBUFS;
1346                                 else {
1347                                         /* Only the initial fragment
1348                                          * is time stamped.
1349                                          */
1350                                         tx_flags = 0;
1351                                 }
1352                         }
1353                         if (skb == NULL)
1354                                 goto error;
1355                         /*
1356                          *      Fill in the control structures
1357                          */
1358                         skb->ip_summed = CHECKSUM_NONE;
1359                         skb->csum = 0;
1360                         /* reserve for fragmentation and ipsec header */
1361                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1362                                     dst_exthdrlen);
1363
1364                         if (sk->sk_type == SOCK_DGRAM)
1365                                 skb_shinfo(skb)->tx_flags = tx_flags;
1366
1367                         /*
1368                          *      Find where to start putting bytes
1369                          */
1370                         data = skb_put(skb, fraglen);
1371                         skb_set_network_header(skb, exthdrlen);
1372                         data += fragheaderlen;
1373                         skb->transport_header = (skb->network_header +
1374                                                  fragheaderlen);
1375                         if (fraggap) {
1376                                 skb->csum = skb_copy_and_csum_bits(
1377                                         skb_prev, maxfraglen,
1378                                         data + transhdrlen, fraggap, 0);
1379                                 skb_prev->csum = csum_sub(skb_prev->csum,
1380                                                           skb->csum);
1381                                 data += fraggap;
1382                                 pskb_trim_unique(skb_prev, maxfraglen);
1383                         }
1384                         copy = datalen - transhdrlen - fraggap;
1385
1386                         if (copy < 0) {
1387                                 err = -EINVAL;
1388                                 kfree_skb(skb);
1389                                 goto error;
1390                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1391                                 err = -EFAULT;
1392                                 kfree_skb(skb);
1393                                 goto error;
1394                         }
1395
1396                         offset += copy;
1397                         length -= datalen - fraggap;
1398                         transhdrlen = 0;
1399                         exthdrlen = 0;
1400                         dst_exthdrlen = 0;
1401
1402                         /*
1403                          * Put the packet on the pending queue
1404                          */
1405                         __skb_queue_tail(&sk->sk_write_queue, skb);
1406                         continue;
1407                 }
1408
1409                 if (copy > length)
1410                         copy = length;
1411
1412                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1413                         unsigned int off;
1414
1415                         off = skb->len;
1416                         if (getfrag(from, skb_put(skb, copy),
1417                                                 offset, copy, off, skb) < 0) {
1418                                 __skb_trim(skb, off);
1419                                 err = -EFAULT;
1420                                 goto error;
1421                         }
1422                 } else {
1423                         int i = skb_shinfo(skb)->nr_frags;
1424                         struct page_frag *pfrag = sk_page_frag(sk);
1425
1426                         err = -ENOMEM;
1427                         if (!sk_page_frag_refill(sk, pfrag))
1428                                 goto error;
1429
1430                         if (!skb_can_coalesce(skb, i, pfrag->page,
1431                                               pfrag->offset)) {
1432                                 err = -EMSGSIZE;
1433                                 if (i == MAX_SKB_FRAGS)
1434                                         goto error;
1435
1436                                 __skb_fill_page_desc(skb, i, pfrag->page,
1437                                                      pfrag->offset, 0);
1438                                 skb_shinfo(skb)->nr_frags = ++i;
1439                                 get_page(pfrag->page);
1440                         }
1441                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1442                         if (getfrag(from,
1443                                     page_address(pfrag->page) + pfrag->offset,
1444                                     offset, copy, skb->len, skb) < 0)
1445                                 goto error_efault;
1446
1447                         pfrag->offset += copy;
1448                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1449                         skb->len += copy;
1450                         skb->data_len += copy;
1451                         skb->truesize += copy;
1452                         atomic_add(copy, &sk->sk_wmem_alloc);
1453                 }
1454                 offset += copy;
1455                 length -= copy;
1456         }
1457
1458         return 0;
1459
1460 error_efault:
1461         err = -EFAULT;
1462 error:
1463         cork->length -= length;
1464         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1465         return err;
1466 }
1467 EXPORT_SYMBOL_GPL(ip6_append_data);
1468
1469 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1470 {
1471         if (np->cork.opt) {
1472                 kfree(np->cork.opt->dst0opt);
1473                 kfree(np->cork.opt->dst1opt);
1474                 kfree(np->cork.opt->hopopt);
1475                 kfree(np->cork.opt->srcrt);
1476                 kfree(np->cork.opt);
1477                 np->cork.opt = NULL;
1478         }
1479
1480         if (inet->cork.base.dst) {
1481                 dst_release(inet->cork.base.dst);
1482                 inet->cork.base.dst = NULL;
1483                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1484         }
1485         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1486 }
1487
1488 int ip6_push_pending_frames(struct sock *sk)
1489 {
1490         struct sk_buff *skb, *tmp_skb;
1491         struct sk_buff **tail_skb;
1492         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1493         struct inet_sock *inet = inet_sk(sk);
1494         struct ipv6_pinfo *np = inet6_sk(sk);
1495         struct net *net = sock_net(sk);
1496         struct ipv6hdr *hdr;
1497         struct ipv6_txoptions *opt = np->cork.opt;
1498         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1499         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1500         unsigned char proto = fl6->flowi6_proto;
1501         int err = 0;
1502
1503         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1504                 goto out;
1505         tail_skb = &(skb_shinfo(skb)->frag_list);
1506
1507         /* move skb->data to ip header from ext header */
1508         if (skb->data < skb_network_header(skb))
1509                 __skb_pull(skb, skb_network_offset(skb));
1510         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1511                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1512                 *tail_skb = tmp_skb;
1513                 tail_skb = &(tmp_skb->next);
1514                 skb->len += tmp_skb->len;
1515                 skb->data_len += tmp_skb->len;
1516                 skb->truesize += tmp_skb->truesize;
1517                 tmp_skb->destructor = NULL;
1518                 tmp_skb->sk = NULL;
1519         }
1520
1521         /* Allow local fragmentation. */
1522         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1523                 skb->local_df = 1;
1524
1525         *final_dst = fl6->daddr;
1526         __skb_pull(skb, skb_network_header_len(skb));
1527         if (opt && opt->opt_flen)
1528                 ipv6_push_frag_opts(skb, opt, &proto);
1529         if (opt && opt->opt_nflen)
1530                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1531
1532         skb_push(skb, sizeof(struct ipv6hdr));
1533         skb_reset_network_header(skb);
1534         hdr = ipv6_hdr(skb);
1535
1536         ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1537         hdr->hop_limit = np->cork.hop_limit;
1538         hdr->nexthdr = proto;
1539         hdr->saddr = fl6->saddr;
1540         hdr->daddr = *final_dst;
1541
1542         skb->priority = sk->sk_priority;
1543         skb->mark = sk->sk_mark;
1544
1545         skb_dst_set(skb, dst_clone(&rt->dst));
1546         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1547         if (proto == IPPROTO_ICMPV6) {
1548                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1549
1550                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1551                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1552         }
1553
1554         err = ip6_local_out(skb);
1555         if (err) {
1556                 if (err > 0)
1557                         err = net_xmit_errno(err);
1558                 if (err)
1559                         goto error;
1560         }
1561
1562 out:
1563         ip6_cork_release(inet, np);
1564         return err;
1565 error:
1566         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1567         goto out;
1568 }
1569 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1570
1571 void ip6_flush_pending_frames(struct sock *sk)
1572 {
1573         struct sk_buff *skb;
1574
1575         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1576                 if (skb_dst(skb))
1577                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1578                                       IPSTATS_MIB_OUTDISCARDS);
1579                 kfree_skb(skb);
1580         }
1581
1582         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1583 }
1584 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);