/* net/ipv6/ip6_output.c */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

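/*
 * nf_hook() returns 1 when the LOCAL_OUT hook lets the packet pass
 * (rather than stealing, queueing or dropping it); only in that case
 * do we continue with dst_output().
 */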
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}

static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

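/*
 * With IPV6_PMTUDISC_PROBE the socket explicitly asks to ignore the
 * discovered path MTU, so the raw device MTU is used instead of the
 * (possibly smaller) value cached in the dst entry.
 */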
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;
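	/*
	 * The first 32-bit word of the IPv6 header packs version (4 bits),
	 * traffic class (8 bits) and flow label (20 bits). fl6_flowlabel is
	 * already in network byte order, so it is OR'd in outside htonl().
	 */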
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 * To avoid extra problems ND packets are sent through this
 * routine. It's code duplication but I really want to avoid
 * extra checks since ipv6_build_header is used by TCP (which
 * is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

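/*
 * Deliver a Router Alert packet to every matching RA listener: each
 * match but the last receives a clone, and the final listener consumes
 * the original skb. Returns 1 if the packet was consumed by at least
 * one listener, 0 if the caller still owns it.
 */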
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any warranty that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

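/*
 * Walk the extension header chain to find where the unfragmentable part
 * ends, i.e. the offset at which a Fragment header may be inserted.
 * Hop-by-hop and routing headers (and destination options preceding a
 * routing header) must be repeated in every fragment. On return,
 * *nexthdr points at the Next Header field that the caller rewrites to
 * NEXTHDR_FRAGMENT.
 */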
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket. (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);
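	/*
	 * 'mtu' now holds the payload budget of each fragment after the
	 * unfragmentable part and the Fragment header itself. For example,
	 * a 1500-byte link MTU with a bare 40-byte IPv6 header (hlen == 40)
	 * leaves 1500 - 40 - 8 = 1452 bytes per fragment.
	 */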
	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
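		/*
		 * Keeping 'len' a multiple of 8 matters because the fragment
		 * offset field counts 8-octet units: a byte offset with its
		 * low three bits clear can be stored via htons(offset) below,
		 * leaving those bits free for the flags (IP6_MF).
		 */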
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

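/*
 * Both helpers below size the copy as (hdrlen + 1) * 8: an extension
 * header's Hdr Ext Len field counts 8-octet units beyond the first
 * 8 octets (RFC 2460), so the full header occupies (hdrlen + 1) * 8
 * bytes.
 */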
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
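	/*
	 * maxfraglen is the largest packet length that keeps the fragmentable
	 * part a multiple of 8 while leaving room for the Fragment header.
	 * For example, mtu = 1500 with fragheaderlen = 40 (a bare IPv6
	 * header) gives ((1500 - 40) & ~7) + 40 - 8 = 1488.
	 */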
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}
		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
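
/*
 * A minimal sketch of the corked-transmit pattern these three functions
 * form (an assumed caller along the lines of the UDPv6 sendmsg path,
 * not part of this file):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, sizeof(struct udphdr),
 *			      hlimit, tclass, opt, fl, rt, flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *
 * ip6_append_data() may be called repeatedly while the socket is corked;
 * ip6_push_pending_frames() builds the final IPv6 header and sends the
 * queued data, and ip6_flush_pending_frames() discards it on error.
 */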