net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
44 #include <net/sock.h>
45 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 static u32 ipv6_fragmentation_id = 1;
63 static DEFINE_SPINLOCK(ip6_id_lock);
65 spin_lock_bh(&ip6_id_lock);
66 fhdr->identification = htonl(ipv6_fragmentation_id);
67 if (++ipv6_fragmentation_id == 0)
68 ipv6_fragmentation_id = 1;
69 spin_unlock_bh(&ip6_id_lock);
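/*
 * __ip6_local_out: fill in payload_len and run the packet through the
 * NF_INET_LOCAL_OUT netfilter hook; ip6_local_out() below additionally
 * calls dst_output() when the hook verdict lets the packet continue.
 */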
72 int __ip6_local_out(struct sk_buff *skb)
74 int len;
76 len = skb->len - sizeof(struct ipv6hdr);
77 if (len > IPV6_MAXPLEN)
78 len = 0;
79 ipv6_hdr(skb)->payload_len = htons(len);
81 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
82 dst_output);
85 int ip6_local_out(struct sk_buff *skb)
87 int err;
89 err = __ip6_local_out(skb);
90 if (likely(err == 1))
91 err = dst_output(skb);
93 return err;
95 EXPORT_SYMBOL_GPL(ip6_local_out);
97 static int ip6_output_finish(struct sk_buff *skb)
99 struct dst_entry *dst = skb->dst;
101 if (dst->hh)
102 return neigh_hh_output(dst->hh, skb);
103 else if (dst->neighbour)
104 return dst->neighbour->output(skb);
106 IP6_INC_STATS_BH(dev_net(dst->dev),
107 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
108 kfree_skb(skb);
109 return -EINVAL;
113 /* dev_loopback_xmit for use with netfilter. */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
116 skb_reset_mac_header(newskb);
117 __skb_pull(newskb, skb_network_offset(newskb));
118 newskb->pkt_type = PACKET_LOOPBACK;
119 newskb->ip_summed = CHECKSUM_UNNECESSARY;
120 WARN_ON(!newskb->dst);
122 netif_rx(newskb);
123 return 0;
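/*
 * ip6_output2: loop multicast packets back to local listeners when
 * required, update the output MIB counters, and hand the packet to the
 * NF_INET_POST_ROUTING hook with ip6_output_finish as the final
 * transmit step.
 */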
127 static int ip6_output2(struct sk_buff *skb)
129 struct dst_entry *dst = skb->dst;
130 struct net_device *dev = dst->dev;
132 skb->protocol = htons(ETH_P_IPV6);
133 skb->dev = dev;
135 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
139 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140 ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
141 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
142 &ipv6_hdr(skb)->saddr))) {
143 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
145 /* Do not check for IFF_ALLMULTI; multicast routing
146 is not supported in any case.
147 */
148 if (newskb)
149 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
150 NULL, newskb->dev,
151 ip6_dev_loopback_xmit);
153 if (ipv6_hdr(skb)->hop_limit == 0) {
154 IP6_INC_STATS(dev_net(dev), idev,
155 IPSTATS_MIB_OUTDISCARDS);
156 kfree_skb(skb);
157 return 0;
161 IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS);
164 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
165 ip6_output_finish);
168 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
170 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
172 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
173 skb->dst->dev->mtu : dst_mtu(skb->dst);
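/*
 * ip6_output: discard the packet if IPv6 is administratively disabled
 * on the egress device; otherwise fragment oversized (non-GSO) packets
 * or packets on an allfrag route, and pass everything else straight to
 * ip6_output2().
 */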
176 int ip6_output(struct sk_buff *skb)
178 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
179 if (unlikely(idev->cnf.disable_ipv6)) {
180 IP6_INC_STATS(dev_net(skb->dst->dev), idev,
181 IPSTATS_MIB_OUTDISCARDS);
182 kfree_skb(skb);
183 return 0;
186 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
187 dst_allfrag(skb->dst))
188 return ip6_fragment(skb, ip6_output2);
189 else
190 return ip6_output2(skb);
193 /*
194 * xmit an sk_buff (used by TCP)
195 */
197 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
198 struct ipv6_txoptions *opt, int ipfragok)
200 struct net *net = sock_net(sk);
201 struct ipv6_pinfo *np = inet6_sk(sk);
202 struct in6_addr *first_hop = &fl->fl6_dst;
203 struct dst_entry *dst = skb->dst;
204 struct ipv6hdr *hdr;
205 u8 proto = fl->proto;
206 int seg_len = skb->len;
207 int hlimit, tclass;
208 u32 mtu;
210 if (opt) {
211 unsigned int head_room;
213 /* First: exthdrs may take lots of space (~8K for now)
214 MAX_HEADER is not enough.
215 */
216 head_room = opt->opt_nflen + opt->opt_flen;
217 seg_len += head_room;
218 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
220 if (skb_headroom(skb) < head_room) {
221 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
222 if (skb2 == NULL) {
223 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
224 IPSTATS_MIB_OUTDISCARDS);
225 kfree_skb(skb);
226 return -ENOBUFS;
228 kfree_skb(skb);
229 skb = skb2;
230 if (sk)
231 skb_set_owner_w(skb, sk);
233 if (opt->opt_flen)
234 ipv6_push_frag_opts(skb, opt, &proto);
235 if (opt->opt_nflen)
236 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
239 skb_push(skb, sizeof(struct ipv6hdr));
240 skb_reset_network_header(skb);
241 hdr = ipv6_hdr(skb);
243 /* Allow local fragmentation. */
244 if (ipfragok)
245 skb->local_df = 1;
247 /*
248 * Fill in the IPv6 header
249 */
251 hlimit = -1;
252 if (np)
253 hlimit = np->hop_limit;
254 if (hlimit < 0)
255 hlimit = ip6_dst_hoplimit(dst);
257 tclass = -1;
258 if (np)
259 tclass = np->tclass;
260 if (tclass < 0)
261 tclass = 0;
263 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
265 hdr->payload_len = htons(seg_len);
266 hdr->nexthdr = proto;
267 hdr->hop_limit = hlimit;
269 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
270 ipv6_addr_copy(&hdr->daddr, first_hop);
272 skb->priority = sk->sk_priority;
273 skb->mark = sk->sk_mark;
275 mtu = dst_mtu(dst);
276 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
277 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
278 IPSTATS_MIB_OUTREQUESTS);
279 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
280 dst_output);
283 if (net_ratelimit())
284 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
285 skb->dev = dst->dev;
286 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
287 IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
288 kfree_skb(skb);
289 return -EMSGSIZE;
292 EXPORT_SYMBOL(ip6_xmit);
294 /*
295 * To avoid extra problems, ND packets are sent through this
296 * routine. It's code duplication, but I really want to avoid
297 * extra checks since ipv6_build_header is used by TCP (which
298 * is performance-critical for us).
299 */
301 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
302 const struct in6_addr *saddr, const struct in6_addr *daddr,
303 int proto, int len)
305 struct ipv6_pinfo *np = inet6_sk(sk);
306 struct ipv6hdr *hdr;
307 int totlen;
309 skb->protocol = htons(ETH_P_IPV6);
310 skb->dev = dev;
312 totlen = len + sizeof(struct ipv6hdr);
314 skb_reset_network_header(skb);
315 skb_put(skb, sizeof(struct ipv6hdr));
316 hdr = ipv6_hdr(skb);
318 *(__be32*)hdr = htonl(0x60000000);
320 hdr->payload_len = htons(len);
321 hdr->nexthdr = proto;
322 hdr->hop_limit = np->hop_limit;
324 ipv6_addr_copy(&hdr->saddr, saddr);
325 ipv6_addr_copy(&hdr->daddr, daddr);
327 return 0;
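/*
 * ip6_call_ra_chain: deliver a packet carrying a Router Alert option to
 * every raw socket registered for that RA value, cloning the skb for
 * all but the last match. Returns 1 if the packet was consumed.
 */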
330 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
332 struct ip6_ra_chain *ra;
333 struct sock *last = NULL;
335 read_lock(&ip6_ra_lock);
336 for (ra = ip6_ra_chain; ra; ra = ra->next) {
337 struct sock *sk = ra->sk;
338 if (sk && ra->sel == sel &&
339 (!sk->sk_bound_dev_if ||
340 sk->sk_bound_dev_if == skb->dev->ifindex)) {
341 if (last) {
342 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
343 if (skb2)
344 rawv6_rcv(last, skb2);
346 last = sk;
350 if (last) {
351 rawv6_rcv(last, skb);
352 read_unlock(&ip6_ra_lock);
353 return 1;
355 read_unlock(&ip6_ra_lock);
356 return 0;
359 static int ip6_forward_proxy_check(struct sk_buff *skb)
361 struct ipv6hdr *hdr = ipv6_hdr(skb);
362 u8 nexthdr = hdr->nexthdr;
363 int offset;
365 if (ipv6_ext_hdr(nexthdr)) {
366 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
367 if (offset < 0)
368 return 0;
369 } else
370 offset = sizeof(struct ipv6hdr);
372 if (nexthdr == IPPROTO_ICMPV6) {
373 struct icmp6hdr *icmp6;
375 if (!pskb_may_pull(skb, (skb_network_header(skb) +
376 offset + 1 - skb->data)))
377 return 0;
379 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
381 switch (icmp6->icmp6_type) {
382 case NDISC_ROUTER_SOLICITATION:
383 case NDISC_ROUTER_ADVERTISEMENT:
384 case NDISC_NEIGHBOUR_SOLICITATION:
385 case NDISC_NEIGHBOUR_ADVERTISEMENT:
386 case NDISC_REDIRECT:
387 /* For reaction involving unicast neighbor discovery
388 * message destined to the proxied address, pass it to
389 * input function.
390 */
391 return 1;
392 default:
393 break;
397 /*
398 * The proxying router can't forward traffic sent to a link-local
399 * address, so signal the sender and discard the packet. This
400 * behavior is clarified by the MIPv6 specification.
401 */
402 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
403 dst_link_failure(skb);
404 return -1;
407 return 0;
410 static inline int ip6_forward_finish(struct sk_buff *skb)
412 return dst_output(skb);
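/*
 * ip6_forward: forwarding path for packets not addressed to this host.
 * It checks forwarding policy, hop limit, proxy ND, XFRM policy and MTU,
 * may emit ICMPv6 errors or redirects, and finally decrements the hop
 * limit before queueing the packet through the NF_INET_FORWARD hook.
 */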
415 int ip6_forward(struct sk_buff *skb)
417 struct dst_entry *dst = skb->dst;
418 struct ipv6hdr *hdr = ipv6_hdr(skb);
419 struct inet6_skb_parm *opt = IP6CB(skb);
420 struct net *net = dev_net(dst->dev);
422 if (net->ipv6.devconf_all->forwarding == 0)
423 goto error;
425 if (skb_warn_if_lro(skb))
426 goto drop;
428 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
429 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
430 goto drop;
433 skb_forward_csum(skb);
435 /*
436 * We DO NOT do any processing on
437 * RA packets, pushing them to user level AS IS
438 * without any WARRANTY that the application will be able
439 * to interpret them. The reason is that we
440 * cannot make anything clever here.
441 *
442 * We are not an end-node, so if the packet contains
443 * AH/ESP, we cannot do anything.
444 * Defragmentation would also be a mistake; RA packets
445 * cannot be fragmented, because there is no guarantee
446 * that different fragments will go along one path. --ANK
447 */
448 if (opt->ra) {
449 u8 *ptr = skb_network_header(skb) + opt->ra;
450 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
451 return 0;
454 /*
455 * check and decrement ttl
456 */
457 if (hdr->hop_limit <= 1) {
458 /* Force OUTPUT device used as source address */
459 skb->dev = dst->dev;
460 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
461 0, skb->dev);
462 IP6_INC_STATS_BH(net,
463 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
465 kfree_skb(skb);
466 return -ETIMEDOUT;
469 /* XXX: idev->cnf.proxy_ndp? */
470 if (net->ipv6.devconf_all->proxy_ndp &&
471 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
472 int proxied = ip6_forward_proxy_check(skb);
473 if (proxied > 0)
474 return ip6_input(skb);
475 else if (proxied < 0) {
476 IP6_INC_STATS(net, ip6_dst_idev(dst),
477 IPSTATS_MIB_INDISCARDS);
478 goto drop;
482 if (!xfrm6_route_forward(skb)) {
483 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
484 goto drop;
486 dst = skb->dst;
488 /* IPv6 specs say nothing about it, but it is clear that we cannot
489 send redirects to source routed frames.
490 We don't send redirects to frames decapsulated from IPsec.
491 */
492 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
493 !skb->sp) {
494 struct in6_addr *target = NULL;
495 struct rt6_info *rt;
496 struct neighbour *n = dst->neighbour;
498 /*
499 * incoming and outgoing devices are the same;
500 * send a redirect.
501 */
503 rt = (struct rt6_info *) dst;
504 if ((rt->rt6i_flags & RTF_GATEWAY))
505 target = (struct in6_addr*)&n->primary_key;
506 else
507 target = &hdr->daddr;
509 /* Limit redirects both by destination (here)
510 and by source (inside ndisc_send_redirect)
511 */
512 if (xrlim_allow(dst, 1*HZ))
513 ndisc_send_redirect(skb, n, target);
514 } else {
515 int addrtype = ipv6_addr_type(&hdr->saddr);
517 /* This check is security critical. */
518 if (addrtype == IPV6_ADDR_ANY ||
519 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
520 goto error;
521 if (addrtype & IPV6_ADDR_LINKLOCAL) {
522 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
523 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
524 goto error;
528 if (skb->len > dst_mtu(dst)) {
529 /* Again, force OUTPUT device used as source address */
530 skb->dev = dst->dev;
531 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
532 IP6_INC_STATS_BH(net,
533 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
534 IP6_INC_STATS_BH(net,
535 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
536 kfree_skb(skb);
537 return -EMSGSIZE;
540 if (skb_cow(skb, dst->dev->hard_header_len)) {
541 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
542 goto drop;
545 hdr = ipv6_hdr(skb);
547 /* Mangling hops number delayed to point after skb COW */
549 hdr->hop_limit--;
551 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
552 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
553 ip6_forward_finish);
555 error:
556 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
557 drop:
558 kfree_skb(skb);
559 return -EINVAL;
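/*
 * ip6_copy_metadata: propagate per-packet metadata (packet type,
 * priority, dst reference, device, mark, traffic-control index,
 * netfilter state and security mark) from the original skb to a
 * freshly built fragment.
 */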
562 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
564 to->pkt_type = from->pkt_type;
565 to->priority = from->priority;
566 to->protocol = from->protocol;
567 dst_release(to->dst);
568 to->dst = dst_clone(from->dst);
569 to->dev = from->dev;
570 to->mark = from->mark;
572 #ifdef CONFIG_NET_SCHED
573 to->tc_index = from->tc_index;
574 #endif
575 nf_copy(to, from);
576 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
577 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
578 to->nf_trace = from->nf_trace;
579 #endif
580 skb_copy_secmark(to, from);
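/*
 * ip6_find_1stfragopt: walk the extension header chain and return the
 * offset at which the fragment header has to be inserted, i.e. just
 * after the headers that belong to the unfragmentable part (hop-by-hop,
 * routing, and a destination options header seen before the routing
 * header). *nexthdr is left pointing at the next-header field to patch.
 */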
583 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
585 u16 offset = sizeof(struct ipv6hdr);
586 struct ipv6_opt_hdr *exthdr =
587 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
588 unsigned int packet_len = skb->tail - skb->network_header;
589 int found_rhdr = 0;
590 *nexthdr = &ipv6_hdr(skb)->nexthdr;
592 while (offset + 1 <= packet_len) {
594 switch (**nexthdr) {
596 case NEXTHDR_HOP:
597 break;
598 case NEXTHDR_ROUTING:
599 found_rhdr = 1;
600 break;
601 case NEXTHDR_DEST:
602 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
603 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
604 break;
605 #endif
606 if (found_rhdr)
607 return offset;
608 break;
609 default :
610 return offset;
613 offset += ipv6_optlen(exthdr);
614 *nexthdr = &exthdr->nexthdr;
615 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
616 offset);
619 return offset;
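/*
 * ip6_fragment: the fast path reuses an existing frag_list when every
 * fragment already has suitable length, alignment and headroom; the
 * slow path below allocates new skbs and copies the payload piece by
 * piece. Either way a fragment header carrying a shared identification
 * value is inserted after the unfragmentable part found by
 * ip6_find_1stfragopt().
 */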
622 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
624 struct sk_buff *frag;
625 struct rt6_info *rt = (struct rt6_info*)skb->dst;
626 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
627 struct ipv6hdr *tmp_hdr;
628 struct frag_hdr *fh;
629 unsigned int mtu, hlen, left, len;
630 __be32 frag_id = 0;
631 int ptr, offset = 0, err=0;
632 u8 *prevhdr, nexthdr = 0;
633 struct net *net = dev_net(skb->dst->dev);
635 hlen = ip6_find_1stfragopt(skb, &prevhdr);
636 nexthdr = *prevhdr;
638 mtu = ip6_skb_dst_mtu(skb);
640 /* We must not fragment if the socket is set to force MTU discovery
641 * or if the skb is not generated by a local socket. (This last
642 * check should be redundant, but it's free.)
643 */
644 if (!skb->local_df) {
645 skb->dev = skb->dst->dev;
646 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
647 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
648 IPSTATS_MIB_FRAGFAILS);
649 kfree_skb(skb);
650 return -EMSGSIZE;
653 if (np && np->frag_size < mtu) {
654 if (np->frag_size)
655 mtu = np->frag_size;
657 mtu -= hlen + sizeof(struct frag_hdr);
659 if (skb_shinfo(skb)->frag_list) {
660 int first_len = skb_pagelen(skb);
661 int truesizes = 0;
663 if (first_len - hlen > mtu ||
664 ((first_len - hlen) & 7) ||
665 skb_cloned(skb))
666 goto slow_path;
668 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
669 /* Correct geometry. */
670 if (frag->len > mtu ||
671 ((frag->len & 7) && frag->next) ||
672 skb_headroom(frag) < hlen)
673 goto slow_path;
675 /* Partially cloned skb? */
676 if (skb_shared(frag))
677 goto slow_path;
679 BUG_ON(frag->sk);
680 if (skb->sk) {
681 sock_hold(skb->sk);
682 frag->sk = skb->sk;
683 frag->destructor = sock_wfree;
684 truesizes += frag->truesize;
688 err = 0;
689 offset = 0;
690 frag = skb_shinfo(skb)->frag_list;
691 skb_shinfo(skb)->frag_list = NULL;
692 /* BUILD HEADER */
694 *prevhdr = NEXTHDR_FRAGMENT;
695 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
696 if (!tmp_hdr) {
697 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
698 IPSTATS_MIB_FRAGFAILS);
699 return -ENOMEM;
702 __skb_pull(skb, hlen);
703 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
704 __skb_push(skb, hlen);
705 skb_reset_network_header(skb);
706 memcpy(skb_network_header(skb), tmp_hdr, hlen);
708 ipv6_select_ident(skb, fh);
709 fh->nexthdr = nexthdr;
710 fh->reserved = 0;
711 fh->frag_off = htons(IP6_MF);
712 frag_id = fh->identification;
714 first_len = skb_pagelen(skb);
715 skb->data_len = first_len - skb_headlen(skb);
716 skb->truesize -= truesizes;
717 skb->len = first_len;
718 ipv6_hdr(skb)->payload_len = htons(first_len -
719 sizeof(struct ipv6hdr));
721 dst_hold(&rt->u.dst);
723 for (;;) {
724 /* Prepare header of the next frame,
725 * before previous one went down. */
726 if (frag) {
727 frag->ip_summed = CHECKSUM_NONE;
728 skb_reset_transport_header(frag);
729 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
730 __skb_push(frag, hlen);
731 skb_reset_network_header(frag);
732 memcpy(skb_network_header(frag), tmp_hdr,
733 hlen);
734 offset += skb->len - hlen - sizeof(struct frag_hdr);
735 fh->nexthdr = nexthdr;
736 fh->reserved = 0;
737 fh->frag_off = htons(offset);
738 if (frag->next != NULL)
739 fh->frag_off |= htons(IP6_MF);
740 fh->identification = frag_id;
741 ipv6_hdr(frag)->payload_len =
742 htons(frag->len -
743 sizeof(struct ipv6hdr));
744 ip6_copy_metadata(frag, skb);
747 err = output(skb);
748 if(!err)
749 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
750 IPSTATS_MIB_FRAGCREATES);
752 if (err || !frag)
753 break;
755 skb = frag;
756 frag = skb->next;
757 skb->next = NULL;
760 kfree(tmp_hdr);
762 if (err == 0) {
763 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
764 IPSTATS_MIB_FRAGOKS);
765 dst_release(&rt->u.dst);
766 return 0;
769 while (frag) {
770 skb = frag->next;
771 kfree_skb(frag);
772 frag = skb;
775 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
776 IPSTATS_MIB_FRAGFAILS);
777 dst_release(&rt->u.dst);
778 return err;
781 slow_path:
782 left = skb->len - hlen; /* Space per frame */
783 ptr = hlen; /* Where to start from */
786 * Fragment the datagram.
789 *prevhdr = NEXTHDR_FRAGMENT;
792 * Keep copying data until we run out.
794 while(left > 0) {
795 len = left;
796 /* IF: it doesn't fit, use 'mtu' - the data space left */
797 if (len > mtu)
798 len = mtu;
799 /* IF: we are not sending up to and including the packet end
800 then align the next start on an eight byte boundary */
801 if (len < left) {
802 len &= ~7;
805 * Allocate buffer.
808 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
809 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
810 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
811 IPSTATS_MIB_FRAGFAILS);
812 err = -ENOMEM;
813 goto fail;
817 * Set up data on packet
820 ip6_copy_metadata(frag, skb);
821 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
822 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
823 skb_reset_network_header(frag);
824 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
825 frag->transport_header = (frag->network_header + hlen +
826 sizeof(struct frag_hdr));
828 /*
829 * Charge the memory for the fragment to any owner
830 * it might possess
831 */
832 if (skb->sk)
833 skb_set_owner_w(frag, skb->sk);
836 * Copy the packet header into the new buffer.
838 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
841 * Build fragment header.
843 fh->nexthdr = nexthdr;
844 fh->reserved = 0;
845 if (!frag_id) {
846 ipv6_select_ident(skb, fh);
847 frag_id = fh->identification;
848 } else
849 fh->identification = frag_id;
852 * Copy a block of the IP datagram.
854 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
855 BUG();
856 left -= len;
858 fh->frag_off = htons(offset);
859 if (left > 0)
860 fh->frag_off |= htons(IP6_MF);
861 ipv6_hdr(frag)->payload_len = htons(frag->len -
862 sizeof(struct ipv6hdr));
864 ptr += len;
865 offset += len;
868 * Put this fragment into the sending queue.
870 err = output(frag);
871 if (err)
872 goto fail;
874 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
875 IPSTATS_MIB_FRAGCREATES);
877 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
878 IPSTATS_MIB_FRAGOKS);
879 kfree_skb(skb);
880 return err;
882 fail:
883 IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
884 IPSTATS_MIB_FRAGFAILS);
885 kfree_skb(skb);
886 return err;
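/*
 * ip6_rt_check / ip6_sk_dst_check: decide whether a cached socket route
 * can still be used for this flow; a host route must match the flow
 * destination exactly, otherwise the cached daddr/saddr and the output
 * interface are compared.
 */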
889 static inline int ip6_rt_check(struct rt6key *rt_key,
890 struct in6_addr *fl_addr,
891 struct in6_addr *addr_cache)
893 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
894 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
897 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
898 struct dst_entry *dst,
899 struct flowi *fl)
901 struct ipv6_pinfo *np = inet6_sk(sk);
902 struct rt6_info *rt = (struct rt6_info *)dst;
904 if (!dst)
905 goto out;
907 /* Yes, checking route validity in the not-connected
908 * case is not very simple. Take into account
909 * that we do not support routing by source, TOS,
910 * and MSG_DONTROUTE --ANK (980726)
911 *
912 * 1. ip6_rt_check(): If route was a host route,
913 * check that the cached destination is current.
914 * If it is a network route, we still may
915 * check its validity using a saved pointer
916 * to the last used address: daddr_cache.
917 * We do not want to save the whole address now,
918 * (because the main consumer of this service
919 * is tcp, which does not have this problem),
920 * so the last trick works only on connected
921 * sockets.
922 * 2. oif also should be the same.
923 */
924 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
925 #ifdef CONFIG_IPV6_SUBTREES
926 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
927 #endif
928 (fl->oif && fl->oif != dst->dev->ifindex)) {
929 dst_release(dst);
930 dst = NULL;
933 out:
934 return dst;
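/*
 * ip6_dst_lookup_tail: do the actual route lookup, pick a source
 * address if the flow has none, and (with optimistic DAD) fall back to
 * the default router's dst entry when the next hop is unresolved and
 * the chosen source address is still optimistic.
 */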
937 static int ip6_dst_lookup_tail(struct sock *sk,
938 struct dst_entry **dst, struct flowi *fl)
940 int err;
941 struct net *net = sock_net(sk);
943 if (*dst == NULL)
944 *dst = ip6_route_output(net, sk, fl);
946 if ((err = (*dst)->error))
947 goto out_err_release;
949 if (ipv6_addr_any(&fl->fl6_src)) {
950 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
951 &fl->fl6_dst,
952 sk ? inet6_sk(sk)->srcprefs : 0,
953 &fl->fl6_src);
954 if (err)
955 goto out_err_release;
958 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
959 /*
960 * Here, if the dst entry we've looked up
961 * has a neighbour entry that is in the INCOMPLETE
962 * state and the src address from the flow is
963 * marked as OPTIMISTIC, we release the found
964 * dst entry and replace it instead with the
965 * dst entry of the nexthop router.
966 */
967 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
968 struct inet6_ifaddr *ifp;
969 struct flowi fl_gw;
970 int redirect;
972 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
973 (*dst)->dev, 1);
975 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
976 if (ifp)
977 in6_ifa_put(ifp);
979 if (redirect) {
980 /*
981 * We need to get the dst entry for the
982 * default router instead.
983 */
984 dst_release(*dst);
985 memcpy(&fl_gw, fl, sizeof(struct flowi));
986 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
987 *dst = ip6_route_output(net, sk, &fl_gw);
988 if ((err = (*dst)->error))
989 goto out_err_release;
992 #endif
994 return 0;
996 out_err_release:
997 if (err == -ENETUNREACH)
998 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
999 dst_release(*dst);
1000 *dst = NULL;
1001 return err;
1004 /**
1005 * ip6_dst_lookup - perform route lookup on flow
1006 * @sk: socket which provides route info
1007 * @dst: pointer to dst_entry * for result
1008 * @fl: flow to lookup
1009 *
1010 * This function performs a route lookup on the given flow.
1011 *
1012 * It returns zero on success, or a standard errno code on error.
1013 */
1014 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1016 *dst = NULL;
1017 return ip6_dst_lookup_tail(sk, dst, fl);
1019 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1021 /**
1022 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1023 * @sk: socket which provides the dst cache and route info
1024 * @dst: pointer to dst_entry * for result
1025 * @fl: flow to lookup
1026 *
1027 * This function performs a route lookup on the given flow with the
1028 * possibility of using the cached route in the socket if it is valid.
1029 * It will take the socket dst lock when operating on the dst cache.
1030 * As a result, this function can only be used in process context.
1031 *
1032 * It returns zero on success, or a standard errno code on error.
1033 */
1034 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1036 *dst = NULL;
1037 if (sk) {
1038 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1039 *dst = ip6_sk_dst_check(sk, *dst, fl);
1042 return ip6_dst_lookup_tail(sk, dst, fl);
1044 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
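/*
 * ip6_ufo_append_data: UDP fragmentation offload path; build one large
 * GSO skb with the payload in page fragments and let the device split
 * it into wire-sized fragments using the precomputed gso_size and the
 * fragment identification value selected here.
 */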
1046 static inline int ip6_ufo_append_data(struct sock *sk,
1047 int getfrag(void *from, char *to, int offset, int len,
1048 int odd, struct sk_buff *skb),
1049 void *from, int length, int hh_len, int fragheaderlen,
1050 int transhdrlen, int mtu,unsigned int flags)
1053 struct sk_buff *skb;
1054 int err;
1056 /* There is support for UDP large send offload by the network
1057 * device, so create one single skb packet containing the complete
1058 * udp datagram.
1059 */
1060 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1061 skb = sock_alloc_send_skb(sk,
1062 hh_len + fragheaderlen + transhdrlen + 20,
1063 (flags & MSG_DONTWAIT), &err);
1064 if (skb == NULL)
1065 return -ENOMEM;
1067 /* reserve space for Hardware header */
1068 skb_reserve(skb, hh_len);
1070 /* create space for UDP/IP header */
1071 skb_put(skb,fragheaderlen + transhdrlen);
1073 /* initialize network header pointer */
1074 skb_reset_network_header(skb);
1076 /* initialize protocol header pointer */
1077 skb->transport_header = skb->network_header + fragheaderlen;
1079 skb->ip_summed = CHECKSUM_PARTIAL;
1080 skb->csum = 0;
1081 sk->sk_sndmsg_off = 0;
1084 err = skb_append_datato_frags(sk,skb, getfrag, from,
1085 (length - transhdrlen));
1086 if (!err) {
1087 struct frag_hdr fhdr;
1089 /* specify the length of each IP datagram fragment */
1090 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1091 sizeof(struct frag_hdr);
1092 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1093 ipv6_select_ident(skb, &fhdr);
1094 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1095 __skb_queue_tail(&sk->sk_write_queue, skb);
1097 return 0;
1099 /* There is not enough support to do UDP LSO,
1100 * so follow the normal path.
1101 */
1102 kfree_skb(skb);
1104 return err;
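/*
 * ip6_opt_dup / ip6_rthdr_dup: duplicate an extension header for the
 * cork; the header length field counts 8-octet units excluding the
 * first, hence the (hdrlen + 1) * 8 byte copy.
 */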
1107 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1108 gfp_t gfp)
1110 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1113 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1114 gfp_t gfp)
1116 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
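/*
 * ip6_append_data: queue user data on sk->sk_write_queue, growing or
 * allocating skbs so that each one (plus headers) fits the path MTU or
 * the 8-byte-aligned fragment size; the first call also sets up the
 * cork state (options, route, hop limit, traffic class, fragment size).
 */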
1119 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1120 int offset, int len, int odd, struct sk_buff *skb),
1121 void *from, int length, int transhdrlen,
1122 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1123 struct rt6_info *rt, unsigned int flags)
1125 struct inet_sock *inet = inet_sk(sk);
1126 struct ipv6_pinfo *np = inet6_sk(sk);
1127 struct sk_buff *skb;
1128 unsigned int maxfraglen, fragheaderlen;
1129 int exthdrlen;
1130 int hh_len;
1131 int mtu;
1132 int copy;
1133 int err;
1134 int offset = 0;
1135 int csummode = CHECKSUM_NONE;
1137 if (flags&MSG_PROBE)
1138 return 0;
1139 if (skb_queue_empty(&sk->sk_write_queue)) {
1141 * setup for corking
1143 if (opt) {
1144 if (WARN_ON(np->cork.opt))
1145 return -EINVAL;
1147 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1148 if (unlikely(np->cork.opt == NULL))
1149 return -ENOBUFS;
1151 np->cork.opt->tot_len = opt->tot_len;
1152 np->cork.opt->opt_flen = opt->opt_flen;
1153 np->cork.opt->opt_nflen = opt->opt_nflen;
1155 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1156 sk->sk_allocation);
1157 if (opt->dst0opt && !np->cork.opt->dst0opt)
1158 return -ENOBUFS;
1160 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1161 sk->sk_allocation);
1162 if (opt->dst1opt && !np->cork.opt->dst1opt)
1163 return -ENOBUFS;
1165 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1166 sk->sk_allocation);
1167 if (opt->hopopt && !np->cork.opt->hopopt)
1168 return -ENOBUFS;
1170 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1171 sk->sk_allocation);
1172 if (opt->srcrt && !np->cork.opt->srcrt)
1173 return -ENOBUFS;
1175 /* need source address above --miyazawa */
1177 dst_hold(&rt->u.dst);
1178 inet->cork.dst = &rt->u.dst;
1179 inet->cork.fl = *fl;
1180 np->cork.hop_limit = hlimit;
1181 np->cork.tclass = tclass;
1182 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1183 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1184 if (np->frag_size < mtu) {
1185 if (np->frag_size)
1186 mtu = np->frag_size;
1188 inet->cork.fragsize = mtu;
1189 if (dst_allfrag(rt->u.dst.path))
1190 inet->cork.flags |= IPCORK_ALLFRAG;
1191 inet->cork.length = 0;
1192 sk->sk_sndmsg_page = NULL;
1193 sk->sk_sndmsg_off = 0;
1194 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1195 rt->rt6i_nfheader_len;
1196 length += exthdrlen;
1197 transhdrlen += exthdrlen;
1198 } else {
1199 rt = (struct rt6_info *)inet->cork.dst;
1200 fl = &inet->cork.fl;
1201 opt = np->cork.opt;
1202 transhdrlen = 0;
1203 exthdrlen = 0;
1204 mtu = inet->cork.fragsize;
1207 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1209 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1210 (opt ? opt->opt_nflen : 0);
1211 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1213 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1214 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1215 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1216 return -EMSGSIZE;
1220 /*
1221 * Let's try using as much space as possible.
1222 * Use MTU if total length of the message fits into the MTU.
1223 * Otherwise, we need to reserve fragment header and
1224 * fragment alignment (= 8-15 octets, in total).
1225 *
1226 * Note that we may need to "move" the data from the tail
1227 * of the buffer to the new fragment when we split
1228 * the message.
1229 *
1230 * FIXME: It may be fragmented into multiple chunks
1231 * at once if non-fragmentable extension headers
1232 * are too large.
1233 * --yoshfuji
1234 */
1236 inet->cork.length += length;
1237 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1238 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1240 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1241 fragheaderlen, transhdrlen, mtu,
1242 flags);
1243 if (err)
1244 goto error;
1245 return 0;
1248 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1249 goto alloc_new_skb;
1251 while (length > 0) {
1252 /* Check if the remaining data fits into current packet. */
1253 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1254 if (copy < length)
1255 copy = maxfraglen - skb->len;
1257 if (copy <= 0) {
1258 char *data;
1259 unsigned int datalen;
1260 unsigned int fraglen;
1261 unsigned int fraggap;
1262 unsigned int alloclen;
1263 struct sk_buff *skb_prev;
1264 alloc_new_skb:
1265 skb_prev = skb;
1267 /* There's no room in the current skb */
1268 if (skb_prev)
1269 fraggap = skb_prev->len - maxfraglen;
1270 else
1271 fraggap = 0;
1274 * If remaining data exceeds the mtu,
1275 * we know we need more fragment(s).
1277 datalen = length + fraggap;
1278 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1279 datalen = maxfraglen - fragheaderlen;
1281 fraglen = datalen + fragheaderlen;
1282 if ((flags & MSG_MORE) &&
1283 !(rt->u.dst.dev->features&NETIF_F_SG))
1284 alloclen = mtu;
1285 else
1286 alloclen = datalen + fragheaderlen;
1288 /*
1289 * The last fragment gets additional space at tail.
1290 * Note: we overallocate on fragments with MSG_MORE
1291 * because we have no idea if we're the last one.
1292 */
1293 if (datalen == length + fraggap)
1294 alloclen += rt->u.dst.trailer_len;
1296 /*
1297 * We just reserve space for fragment header.
1298 * Note: this may be overallocation if the message
1299 * (without MSG_MORE) fits into the MTU.
1300 */
1301 alloclen += sizeof(struct frag_hdr);
1303 if (transhdrlen) {
1304 skb = sock_alloc_send_skb(sk,
1305 alloclen + hh_len,
1306 (flags & MSG_DONTWAIT), &err);
1307 } else {
1308 skb = NULL;
1309 if (atomic_read(&sk->sk_wmem_alloc) <=
1310 2 * sk->sk_sndbuf)
1311 skb = sock_wmalloc(sk,
1312 alloclen + hh_len, 1,
1313 sk->sk_allocation);
1314 if (unlikely(skb == NULL))
1315 err = -ENOBUFS;
1317 if (skb == NULL)
1318 goto error;
1320 * Fill in the control structures
1322 skb->ip_summed = csummode;
1323 skb->csum = 0;
1324 /* reserve for fragmentation */
1325 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1328 * Find where to start putting bytes
1330 data = skb_put(skb, fraglen);
1331 skb_set_network_header(skb, exthdrlen);
1332 data += fragheaderlen;
1333 skb->transport_header = (skb->network_header +
1334 fragheaderlen);
1335 if (fraggap) {
1336 skb->csum = skb_copy_and_csum_bits(
1337 skb_prev, maxfraglen,
1338 data + transhdrlen, fraggap, 0);
1339 skb_prev->csum = csum_sub(skb_prev->csum,
1340 skb->csum);
1341 data += fraggap;
1342 pskb_trim_unique(skb_prev, maxfraglen);
1344 copy = datalen - transhdrlen - fraggap;
1345 if (copy < 0) {
1346 err = -EINVAL;
1347 kfree_skb(skb);
1348 goto error;
1349 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1350 err = -EFAULT;
1351 kfree_skb(skb);
1352 goto error;
1355 offset += copy;
1356 length -= datalen - fraggap;
1357 transhdrlen = 0;
1358 exthdrlen = 0;
1359 csummode = CHECKSUM_NONE;
1362 * Put the packet on the pending queue
1364 __skb_queue_tail(&sk->sk_write_queue, skb);
1365 continue;
1368 if (copy > length)
1369 copy = length;
1371 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1372 unsigned int off;
1374 off = skb->len;
1375 if (getfrag(from, skb_put(skb, copy),
1376 offset, copy, off, skb) < 0) {
1377 __skb_trim(skb, off);
1378 err = -EFAULT;
1379 goto error;
1381 } else {
1382 int i = skb_shinfo(skb)->nr_frags;
1383 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1384 struct page *page = sk->sk_sndmsg_page;
1385 int off = sk->sk_sndmsg_off;
1386 unsigned int left;
1388 if (page && (left = PAGE_SIZE - off) > 0) {
1389 if (copy >= left)
1390 copy = left;
1391 if (page != frag->page) {
1392 if (i == MAX_SKB_FRAGS) {
1393 err = -EMSGSIZE;
1394 goto error;
1396 get_page(page);
1397 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1398 frag = &skb_shinfo(skb)->frags[i];
1400 } else if(i < MAX_SKB_FRAGS) {
1401 if (copy > PAGE_SIZE)
1402 copy = PAGE_SIZE;
1403 page = alloc_pages(sk->sk_allocation, 0);
1404 if (page == NULL) {
1405 err = -ENOMEM;
1406 goto error;
1408 sk->sk_sndmsg_page = page;
1409 sk->sk_sndmsg_off = 0;
1411 skb_fill_page_desc(skb, i, page, 0, 0);
1412 frag = &skb_shinfo(skb)->frags[i];
1413 } else {
1414 err = -EMSGSIZE;
1415 goto error;
1417 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1418 err = -EFAULT;
1419 goto error;
1421 sk->sk_sndmsg_off += copy;
1422 frag->size += copy;
1423 skb->len += copy;
1424 skb->data_len += copy;
1425 skb->truesize += copy;
1426 atomic_add(copy, &sk->sk_wmem_alloc);
1428 offset += copy;
1429 length -= copy;
1431 return 0;
1432 error:
1433 inet->cork.length -= length;
1434 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1435 return err;
1438 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1440 if (np->cork.opt) {
1441 kfree(np->cork.opt->dst0opt);
1442 kfree(np->cork.opt->dst1opt);
1443 kfree(np->cork.opt->hopopt);
1444 kfree(np->cork.opt->srcrt);
1445 kfree(np->cork.opt);
1446 np->cork.opt = NULL;
1449 if (inet->cork.dst) {
1450 dst_release(inet->cork.dst);
1451 inet->cork.dst = NULL;
1452 inet->cork.flags &= ~IPCORK_ALLFRAG;
1454 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
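/*
 * ip6_push_pending_frames: splice every skb queued by ip6_append_data()
 * into one packet (extra skbs become the frag_list), prepend the
 * extension headers and the IPv6 header, and send the result with
 * ip6_local_out().
 */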
1457 int ip6_push_pending_frames(struct sock *sk)
1459 struct sk_buff *skb, *tmp_skb;
1460 struct sk_buff **tail_skb;
1461 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1462 struct inet_sock *inet = inet_sk(sk);
1463 struct ipv6_pinfo *np = inet6_sk(sk);
1464 struct net *net = sock_net(sk);
1465 struct ipv6hdr *hdr;
1466 struct ipv6_txoptions *opt = np->cork.opt;
1467 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1468 struct flowi *fl = &inet->cork.fl;
1469 unsigned char proto = fl->proto;
1470 int err = 0;
1472 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1473 goto out;
1474 tail_skb = &(skb_shinfo(skb)->frag_list);
1476 /* move skb->data to ip header from ext header */
1477 if (skb->data < skb_network_header(skb))
1478 __skb_pull(skb, skb_network_offset(skb));
1479 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1480 __skb_pull(tmp_skb, skb_network_header_len(skb));
1481 *tail_skb = tmp_skb;
1482 tail_skb = &(tmp_skb->next);
1483 skb->len += tmp_skb->len;
1484 skb->data_len += tmp_skb->len;
1485 skb->truesize += tmp_skb->truesize;
1486 __sock_put(tmp_skb->sk);
1487 tmp_skb->destructor = NULL;
1488 tmp_skb->sk = NULL;
1491 /* Allow local fragmentation. */
1492 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1493 skb->local_df = 1;
1495 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1496 __skb_pull(skb, skb_network_header_len(skb));
1497 if (opt && opt->opt_flen)
1498 ipv6_push_frag_opts(skb, opt, &proto);
1499 if (opt && opt->opt_nflen)
1500 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1502 skb_push(skb, sizeof(struct ipv6hdr));
1503 skb_reset_network_header(skb);
1504 hdr = ipv6_hdr(skb);
1506 *(__be32*)hdr = fl->fl6_flowlabel |
1507 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1509 hdr->hop_limit = np->cork.hop_limit;
1510 hdr->nexthdr = proto;
1511 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1512 ipv6_addr_copy(&hdr->daddr, final_dst);
1514 skb->priority = sk->sk_priority;
1515 skb->mark = sk->sk_mark;
1517 skb->dst = dst_clone(&rt->u.dst);
1518 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1519 if (proto == IPPROTO_ICMPV6) {
1520 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1522 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1523 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1526 err = ip6_local_out(skb);
1527 if (err) {
1528 if (err > 0)
1529 err = np->recverr ? net_xmit_errno(err) : 0;
1530 if (err)
1531 goto error;
1534 out:
1535 ip6_cork_release(inet, np);
1536 return err;
1537 error:
1538 goto out;
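/*
 * ip6_flush_pending_frames: abort a corked send, discarding anything
 * still on the write queue and releasing the cork state.
 */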
1541 void ip6_flush_pending_frames(struct sock *sk)
1543 struct sk_buff *skb;
1545 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1546 if (skb->dst)
1547 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
1548 IPSTATS_MIB_OUTDISCARDS);
1549 kfree_skb(skb);
1552 ip6_cork_release(inet_sk(sk), inet6_sk(sk));