/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
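
/*
 * Fix up the IPv6 payload length (zeroed when it would exceed
 * IPV6_MAXPLEN, as is done for jumbograms) and run the packet through
 * the LOCAL_OUT netfilter hook.  nf_hook() returns 1 when the caller
 * may transmit the packet itself, which ip6_local_out() turns into a
 * call to dst_output().
 */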
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
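
/*
 * Last step of output: use the cached hardware header if the route has
 * one, otherwise ask the neighbour entry to resolve and transmit.  With
 * neither present there is nowhere to send the packet, so count it as
 * "no route" and drop it.
 */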
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx(newskb);
	return 0;
}
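
/*
 * For multicast destinations, a clone may have to be looped back to
 * local listeners through the POST_ROUTING hook before the original
 * goes out: either a multicast routing socket wants to see packets
 * that were not forwarded, or an interface has joined the destination
 * group.  A hop limit of zero means the packet must never reach the
 * wire.
 */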
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
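
/*
 * MTU to fragment against: the device MTU when the socket does its own
 * path MTU probing (IPV6_PMTUDISC_PROBE), otherwise the route's MTU.
 */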
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
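
/*
 * Entry point from dst_output(): discard everything if IPv6 is
 * administratively disabled on the egress device, fragment when the
 * packet exceeds the path MTU (unless it is GSO) or when the route
 * requires fragmenting every packet (dst_allfrag), then hand the
 * result to ip6_output2().
 */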
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
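
/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value.  Each matching socket except
 * the last gets a clone; the last one consumes the original skb, in
 * which case 1 is returned and the caller must not touch skb again.
 */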
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
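
/*
 * Called for packets whose destination we proxy neighbour discovery
 * for.  Returns 1 if the packet is an ND message that must be handed
 * to local input, -1 if it has to be discarded (link-local
 * destination), and 0 to let normal forwarding proceed.
 */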
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbour discovery message destined
			 * to the proxied address must be passed to the
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
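
/*
 * ip6_forward_finish() hands the packet to dst_output() once the
 * FORWARD netfilter hook has accepted it.  ip6_forward() below does
 * the real work: forwarding and xfrm policy checks, Router Alert
 * delivery, hop limit handling, NDP proxying, optional redirects back
 * to the sender, and MTU enforcement.
 */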
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets;
	 *	we push them to user level AS IS without any
	 *	warranty that the application will be able to
	 *	interpret them. The reason is that we
	 *	cannot do anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
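
/*
 * Copy the per-packet metadata that every fragment has to carry from
 * the original skb: packet type, priority, protocol, route, device,
 * mark, traffic-control index, netfilter state and security mark.
 */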
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
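
/*
 * Find the offset at which the Fragment header has to be inserted:
 * after the extension headers that belong to the unfragmentable part
 * (Hop-by-Hop, Routing, and Destination Options that precede a Routing
 * header or carry a Home Address option).  *nexthdr is left pointing
 * at the nexthdr byte that will be overwritten with NEXTHDR_FRAGMENT.
 */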
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
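
/*
 * Two strategies: if the skb already carries a well-formed frag_list
 * (all pieces 8-byte aligned, unshared, with headroom for the headers),
 * each list member becomes a fragment in place ("fast path");
 * otherwise fall through to the slow path, which allocates a fresh skb
 * per fragment and copies the data out of the original.
 */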
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
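
/*
 * A cached route is considered stale when it is a host route for some
 * other destination, or a network route whose last-used destination
 * cache does not match the flow.  ip6_sk_dst_check() additionally
 * requires the flow's output interface to match the cached route's
 * device.
 */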
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
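
/*
 * UFO path for ip6_append_data(): instead of queueing one skb per
 * fragment, build a single oversized skb and let the device (or the
 * software GSO layer) split it on gso_size boundaries at transmit
 * time.
 */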
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
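
/*
 * Append data to an existing or newly started corked packet.  The
 * first call on an empty write queue sets up the cork state
 * (duplicated options, route, MTU and fragment size); subsequent calls
 * reuse it.  Data is packed into MTU-sized skbs, each pre-reserving
 * room for a Fragment header, so ip6_push_pending_frames() or
 * ip6_fragment() can finish the job without reshuffling.
 */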
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
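
/*
 * Splice every skb on the socket's write queue into one packet (the
 * tail skbs become the frag_list of the first), push the extension
 * headers and the IPv6 header saved in the cork, and send the result
 * through ip6_local_out().
 */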
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}