net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *      Based on linux/net/ipv4/ip_output.c
  11  *
  12  *      This program is free software; you can redistribute it and/or
  13  *      modify it under the terms of the GNU General Public License
  14  *      as published by the Free Software Foundation; either version
  15  *      2 of the License, or (at your option) any later version.
  16  *
  17  *      Changes:
  18  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  19  *                              extension headers are implemented.
  20  *                              route changes now work.
  21  *                              ip6_forward does not confuse sniffers.
  22  *                              etc.
  23  *
  24  *      H. von Brand    :       Added missing #include <linux/string.h>
  25  *      Imran Patel     :       frag id should be in NBO
  26  *      Kazunori MIYAZAWA @USAGI
  27  *                      :       add ip6_append_data and related functions
  28  *                              for datagram xmit
  29  */
  30
  31 #include <linux/config.h>
  32 #include <linux/errno.h>
  33 #include <linux/types.h>
  34 #include <linux/string.h>
  35 #include <linux/socket.h>
  36 #include <linux/net.h>
  37 #include <linux/netdevice.h>
  38 #include <linux/if_arp.h>
  39 #include <linux/in6.h>
  40 #include <linux/tcp.h>
  41 #include <linux/route.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  62 {
  63         static u32 ipv6_fragmentation_id = 1;
  64         static DEFINE_SPINLOCK(ip6_id_lock);
  65
  66         spin_lock_bh(&ip6_id_lock);
  67         fhdr->identification = htonl(ipv6_fragmentation_id);
  68         if (++ipv6_fragmentation_id == 0)
  69                 ipv6_fragmentation_id = 1;
  70         spin_unlock_bh(&ip6_id_lock);
  71 }
  72
  73 static inline int ip6_output_finish(struct sk_buff *skb)
  74 {
  75
  76         struct dst_entry *dst = skb->dst;
  77         struct hh_cache *hh = dst->hh;
  78
  79         if (hh) {
  80                 int hh_alen;
  81
  82                 read_lock_bh(&hh->hh_lock);
  83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
  84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
  85                 read_unlock_bh(&hh->hh_lock);
  86                 skb_push(skb, hh->hh_len);
  87                 return hh->hh_output(skb);
  88         } else if (dst->neighbour)
  89                 return dst->neighbour->output(skb);
  90
  91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
  92         kfree_skb(skb);
  93         return -EINVAL;
  94
  95 }
  96
  97 /* dev_loopback_xmit for use with netfilter. */
  98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  99 {
 100         newskb->mac.raw = newskb->data;
 101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 102         newskb->pkt_type = PACKET_LOOPBACK;
 103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 104         BUG_TRAP(newskb->dst);
 105
 106         netif_rx(newskb);
 107         return 0;
 108 }
 109
 110
 111 static int ip6_output2(struct sk_buff *skb)
 112 {
 113         struct dst_entry *dst = skb->dst;
 114         struct net_device *dev = dst->dev;
 115
 116         skb->protocol = htons(ETH_P_IPV6);
 117         skb->dev = dev;
 118
 119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
 120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 121
 122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
 124                                 &skb->nh.ipv6h->saddr)) {
 125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 126
 127                         /* Do not check for IFF_ALLMULTI; multicast routing
 128                            is not supported in any case.
 129                          */
 130                         if (newskb)
 131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
 132                                         newskb->dev,
 133                                         ip6_dev_loopback_xmit);
 134
 135                         if (skb->nh.ipv6h->hop_limit == 0) {
 136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 137                                 kfree_skb(skb);
 138                                 return 0;
 139                         }
 140                 }
 141
 142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 143         }
 144
 145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 146 }
 147
 148 int ip6_output(struct sk_buff *skb)
 149 {
 150         if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
 151                 return ip6_fragment(skb, ip6_output2);
 152         else
 153                 return ip6_output2(skb);
 154 }
 155
 156 #ifdef CONFIG_NETFILTER
 157 int ip6_route_me_harder(struct sk_buff *skb)
 158 {
 159         struct ipv6hdr *iph = skb->nh.ipv6h;
 160         struct dst_entry *dst;
 161         struct flowi fl = {
 162                 .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
 163                 .nl_u =
 164                 { .ip6_u =
 165                   { .daddr = iph->daddr,
 166                     .saddr = iph->saddr, } },
 167                 .proto = iph->nexthdr,
 168         };
 169
 170         dst = ip6_route_output(skb->sk, &fl);
 171
 172         if (dst->error) {
 173                 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 174                 LIMIT_NETDEBUG(
 175                         printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
 176                 dst_release(dst);
 177                 return -EINVAL;
 178         }
 179
 180         /* Drop old route. */
 181         dst_release(skb->dst);
 182
 183         skb->dst = dst;
 184         return 0;
 185 }
 186 #endif
 187
 188 static inline int ip6_maybe_reroute(struct sk_buff *skb)
 189 {
 190 #ifdef CONFIG_NETFILTER
 191         if (skb->nfcache & NFC_ALTERED){
 192                 if (ip6_route_me_harder(skb) != 0){
 193                         kfree_skb(skb);
 194                         return -EINVAL;
 195                 }
 196         }
 197 #endif /* CONFIG_NETFILTER */
 198         return dst_output(skb);
 199 }
 200
 201 /*
 202  *      xmit an sk_buff (used by TCP)
 203  */
 204
 205 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 206              struct ipv6_txoptions *opt, int ipfragok)
 207 {
 208         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
 209         struct in6_addr *first_hop = &fl->fl6_dst;
 210         struct dst_entry *dst = skb->dst;
 211         struct ipv6hdr *hdr;
 212         u8  proto = fl->proto;
 213         int seg_len = skb->len;
 214         int hlimit;
 215         u32 mtu;
 216
 217         if (opt) {
 218                 int head_room;
 219
 220                 /* First: exthdrs may take lots of space (~8K for now)
 221                    MAX_HEADER is not enough.
 222                  */
 223                 head_room = opt->opt_nflen + opt->opt_flen;
 224                 seg_len += head_room;
 225                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 226
 227                 if (skb_headroom(skb) < head_room) {
 228                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 229                         kfree_skb(skb);
 230                         skb = skb2;
 231                         if (skb == NULL) {
 232                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 233                                 return -ENOBUFS;
 234                         }
 235                         if (sk)
 236                                 skb_set_owner_w(skb, sk);
 237                 }
 238                 if (opt->opt_flen)
 239                         ipv6_push_frag_opts(skb, opt, &proto);
 240                 if (opt->opt_nflen)
 241                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 242         }
 243
 244         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
 245
 246         /*
 247          *      Fill in the IPv6 header
 248          */
 249
 250         *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
 251         hlimit = -1;
 252         if (np)
 253                 hlimit = np->hop_limit;
 254         if (hlimit < 0)
 255                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
 256         if (hlimit < 0)
 257                 hlimit = ipv6_get_hoplimit(dst->dev);
 258
 259         hdr->payload_len = htons(seg_len);
 260         hdr->nexthdr = proto;
 261         hdr->hop_limit = hlimit;
 262
 263         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 264         ipv6_addr_copy(&hdr->daddr, first_hop);
 265
 266         mtu = dst_mtu(dst);
 267         if ((skb->len <= mtu) || ipfragok) {
 268                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 269                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
 270         }
 271
 272         if (net_ratelimit())
 273                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 274         skb->dev = dst->dev;
 275         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 276         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 277         kfree_skb(skb);
 278         return -EMSGSIZE;
 279 }
 280
 281 /*
 282  *      To avoid extra problems ND packets are send through this
 283  *      routine. It's code duplication but I really want to avoid
 284  *      extra checks since ipv6_build_header is used by TCP (which
 285  *      is for us performance critical)
 286  */
 287
 288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 289                struct in6_addr *saddr, struct in6_addr *daddr,
 290                int proto, int len)
 291 {
 292         struct ipv6_pinfo *np = inet6_sk(sk);
 293         struct ipv6hdr *hdr;
 294         int totlen;
 295
 296         skb->protocol = htons(ETH_P_IPV6);
 297         skb->dev = dev;
 298
 299         totlen = len + sizeof(struct ipv6hdr);
 300
 301         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
 302         skb->nh.ipv6h = hdr;
 303
 304         *(u32*)hdr = htonl(0x60000000);
 305
 306         hdr->payload_len = htons(len);
 307         hdr->nexthdr = proto;
 308         hdr->hop_limit = np->hop_limit;
 309
 310         ipv6_addr_copy(&hdr->saddr, saddr);
 311         ipv6_addr_copy(&hdr->daddr, daddr);
 312
 313         return 0;
 314 }
 315
 316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 317 {
 318         struct ip6_ra_chain *ra;
 319         struct sock *last = NULL;
 320
 321         read_lock(&ip6_ra_lock);
 322         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 323                 struct sock *sk = ra->sk;
 324                 if (sk && ra->sel == sel) {
 325                         if (last) {
 326                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 327                                 if (skb2)
 328                                         rawv6_rcv(last, skb2);
 329                         }
 330                         last = sk;
 331                 }
 332         }
 333
 334         if (last) {
 335                 rawv6_rcv(last, skb);
 336                 read_unlock(&ip6_ra_lock);
 337                 return 1;
 338         }
 339         read_unlock(&ip6_ra_lock);
 340         return 0;
 341 }
 342
 343 static inline int ip6_forward_finish(struct sk_buff *skb)
 344 {
 345         return dst_output(skb);
 346 }
 347
 348 int ip6_forward(struct sk_buff *skb)
 349 {
 350         struct dst_entry *dst = skb->dst;
 351         struct ipv6hdr *hdr = skb->nh.ipv6h;
 352         struct inet6_skb_parm *opt = IP6CB(skb);
 353
 354         if (ipv6_devconf.forwarding == 0)
 355                 goto error;
 356
 357         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 358                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
 359                 goto drop;
 360         }
 361
 362         skb->ip_summed = CHECKSUM_NONE;
 363
 364         /*
 365          *      We DO NOT make any processing on
 366          *      RA packets, pushing them to user level AS IS
 367          *      without ane WARRANTY that application will be able
 368          *      to interpret them. The reason is that we
 369          *      cannot make anything clever here.
 370          *
 371          *      We are not end-node, so that if packet contains
 372          *      AH/ESP, we cannot make anything.
 373          *      Defragmentation also would be mistake, RA packets
 374          *      cannot be fragmented, because there is no warranty
 375          *      that different fragments will go along one path. --ANK
 376          */
 377         if (opt->ra) {
 378                 u8 *ptr = skb->nh.raw + opt->ra;
 379                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 380                         return 0;
 381         }
 382
 383         /*
 384          *      check and decrement ttl
 385          */
 386         if (hdr->hop_limit <= 1) {
 387                 /* Force OUTPUT device used as source address */
 388                 skb->dev = dst->dev;
 389                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 390                             0, skb->dev);
 391
 392                 kfree_skb(skb);
 393                 return -ETIMEDOUT;
 394         }
 395
 396         if (!xfrm6_route_forward(skb)) {
 397                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
 398                 goto drop;
 399         }
 400         dst = skb->dst;
 401
 402         /* IPv6 specs say nothing about it, but it is clear that we cannot
 403            send redirects to source routed frames.
 404          */
 405         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
 406                 struct in6_addr *target = NULL;
 407                 struct rt6_info *rt;
 408                 struct neighbour *n = dst->neighbour;
 409
 410                 /*
 411                  *      incoming and outgoing devices are the same
 412                  *      send a redirect.
 413                  */
 414
 415                 rt = (struct rt6_info *) dst;
 416                 if ((rt->rt6i_flags & RTF_GATEWAY))
 417                         target = (struct in6_addr*)&n->primary_key;
 418                 else
 419                         target = &hdr->daddr;
 420
 421                 /* Limit redirects both by destination (here)
 422                    and by source (inside ndisc_send_redirect)
 423                  */
 424                 if (xrlim_allow(dst, 1*HZ))
 425                         ndisc_send_redirect(skb, n, target);
 426         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
 427                                                 |IPV6_ADDR_LINKLOCAL)) {
 428                 /* This check is security critical. */
 429                 goto error;
 430         }
 431
 432         if (skb->len > dst_mtu(dst)) {
 433                 /* Again, force OUTPUT device used as source address */
 434                 skb->dev = dst->dev;
 435                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 436                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
 437                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
 438                 kfree_skb(skb);
 439                 return -EMSGSIZE;
 440         }
 441
 442         if (skb_cow(skb, dst->dev->hard_header_len)) {
 443                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 444                 goto drop;
 445         }
 446
 447         hdr = skb->nh.ipv6h;
 448
 449         /* Mangling hops number delayed to point after skb COW */
 450
 451         hdr->hop_limit--;
 452
 453         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
 454         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
 455
 456 error:
 457         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 458 drop:
 459         kfree_skb(skb);
 460         return -EINVAL;
 461 }
 462
 463 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 464 {
 465         to->pkt_type = from->pkt_type;
 466         to->priority = from->priority;
 467         to->protocol = from->protocol;
 468         to->security = from->security;
 469         dst_release(to->dst);
 470         to->dst = dst_clone(from->dst);
 471         to->dev = from->dev;
 472
 473 #ifdef CONFIG_NET_SCHED
 474         to->tc_index = from->tc_index;
 475 #endif
 476 #ifdef CONFIG_NETFILTER
 477         to->nfmark = from->nfmark;
 478         /* Connection association is same as pre-frag packet */
 479         to->nfct = from->nfct;
 480         nf_conntrack_get(to->nfct);
 481         to->nfctinfo = from->nfctinfo;
 482 #ifdef CONFIG_BRIDGE_NETFILTER
 483         nf_bridge_put(to->nf_bridge);
 484         to->nf_bridge = from->nf_bridge;
 485         nf_bridge_get(to->nf_bridge);
 486 #endif
 487 #ifdef CONFIG_NETFILTER_DEBUG
 488         to->nf_debug = from->nf_debug;
 489 #endif
 490 #endif
 491 }
 492
 493 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 494 {
 495         u16 offset = sizeof(struct ipv6hdr);
 496         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
 497         unsigned int packet_len = skb->tail - skb->nh.raw;
 498         int found_rhdr = 0;
 499         *nexthdr = &skb->nh.ipv6h->nexthdr;
 500
 501         while (offset + 1 <= packet_len) {
 502
 503                 switch (**nexthdr) {
 504
 505                 case NEXTHDR_HOP:
 506                 case NEXTHDR_ROUTING:
 507                 case NEXTHDR_DEST:
 508                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
 509                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
 510                         offset += ipv6_optlen(exthdr);
 511                         *nexthdr = &exthdr->nexthdr;
 512                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
 513                         break;
 514                 default :
 515                         return offset;
 516                 }
 517         }
 518
 519         return offset;
 520 }
 521
 522 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 523 {
 524         struct net_device *dev;
 525         struct sk_buff *frag;
 526         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 527         struct ipv6hdr *tmp_hdr;
 528         struct frag_hdr *fh;
 529         unsigned int mtu, hlen, left, len;
 530         u32 frag_id = 0;
 531         int ptr, offset = 0, err=0;
 532         u8 *prevhdr, nexthdr = 0;
 533
 534         dev = rt->u.dst.dev;
 535         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 536         nexthdr = *prevhdr;
 537
 538         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
 539
 540         if (skb_shinfo(skb)->frag_list) {
 541                 int first_len = skb_pagelen(skb);
 542
 543                 if (first_len - hlen > mtu ||
 544                     ((first_len - hlen) & 7) ||
 545                     skb_cloned(skb))
 546                         goto slow_path;
 547
 548                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 549                         /* Correct geometry. */
 550                         if (frag->len > mtu ||
 551                             ((frag->len & 7) && frag->next) ||
 552                             skb_headroom(frag) < hlen)
 553                             goto slow_path;
 554
 555                         /* Partially cloned skb? */
 556                         if (skb_shared(frag))
 557                                 goto slow_path;
 558
 559                         BUG_ON(frag->sk);
 560                         if (skb->sk) {
 561                                 sock_hold(skb->sk);
 562                                 frag->sk = skb->sk;
 563                                 frag->destructor = sock_wfree;
 564                                 skb->truesize -= frag->truesize;
 565                         }
 566                 }
 567
 568                 err = 0;
 569                 offset = 0;
 570                 frag = skb_shinfo(skb)->frag_list;
 571                 skb_shinfo(skb)->frag_list = NULL;
 572                 /* BUILD HEADER */
 573
 574                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
 575                 if (!tmp_hdr) {
 576                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 577                         return -ENOMEM;
 578                 }
 579
 580                 *prevhdr = NEXTHDR_FRAGMENT;
 581                 memcpy(tmp_hdr, skb->nh.raw, hlen);
 582                 __skb_pull(skb, hlen);
 583                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 584                 skb->nh.raw = __skb_push(skb, hlen);
 585                 memcpy(skb->nh.raw, tmp_hdr, hlen);
 586
 587                 ipv6_select_ident(skb, fh);
 588                 fh->nexthdr = nexthdr;
 589                 fh->reserved = 0;
 590                 fh->frag_off = htons(IP6_MF);
 591                 frag_id = fh->identification;
 592
 593                 first_len = skb_pagelen(skb);
 594                 skb->data_len = first_len - skb_headlen(skb);
 595                 skb->len = first_len;
 596                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 597
 598
 599                 for (;;) {
 600                         /* Prepare header of the next frame,
 601                          * before previous one went down. */
 602                         if (frag) {
 603                                 frag->ip_summed = CHECKSUM_NONE;
 604                                 frag->h.raw = frag->data;
 605                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 606                                 frag->nh.raw = __skb_push(frag, hlen);
 607                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
 608                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 609                                 fh->nexthdr = nexthdr;
 610                                 fh->reserved = 0;
 611                                 fh->frag_off = htons(offset);
 612                                 if (frag->next != NULL)
 613                                         fh->frag_off |= htons(IP6_MF);
 614                                 fh->identification = frag_id;
 615                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 616                                 ip6_copy_metadata(frag, skb);
 617                         }
 618
 619                         err = output(skb);
 620                         if (err || !frag)
 621                                 break;
 622
 623                         skb = frag;
 624                         frag = skb->next;
 625                         skb->next = NULL;
 626                 }
 627
 628                 if (tmp_hdr)
 629                         kfree(tmp_hdr);
 630
 631                 if (err == 0) {
 632                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
 633                         return 0;
 634                 }
 635
 636                 while (frag) {
 637                         skb = frag->next;
 638                         kfree_skb(frag);
 639                         frag = skb;
 640                 }
 641
 642                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 643                 return err;
 644         }
 645
 646 slow_path:
 647         left = skb->len - hlen;         /* Space per frame */
 648         ptr = hlen;                     /* Where to start from */
 649
 650         /*
 651          *      Fragment the datagram.
 652          */
 653
 654         *prevhdr = NEXTHDR_FRAGMENT;
 655
 656         /*
 657          *      Keep copying data until we run out.
 658          */
 659         while(left > 0) {
 660                 len = left;
 661                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 662                 if (len > mtu)
 663                         len = mtu;
 664                 /* IF: we are not sending upto and including the packet end
 665                    then align the next start on an eight byte boundary */
 666                 if (len < left) {
 667                         len &= ~7;
 668                 }
 669                 /*
 670                  *      Allocate buffer.
 671                  */
 672
 673                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 674                         NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
 675                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 676                         err = -ENOMEM;
 677                         goto fail;
 678                 }
 679
 680                 /*
 681                  *      Set up data on packet
 682                  */
 683
 684                 ip6_copy_metadata(frag, skb);
 685                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 686                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 687                 frag->nh.raw = frag->data;
 688                 fh = (struct frag_hdr*)(frag->data + hlen);
 689                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
 690
 691                 /*
 692                  *      Charge the memory for the fragment to any owner
 693                  *      it might possess
 694                  */
 695                 if (skb->sk)
 696                         skb_set_owner_w(frag, skb->sk);
 697
 698                 /*
 699                  *      Copy the packet header into the new buffer.
 700                  */
 701                 memcpy(frag->nh.raw, skb->data, hlen);
 702
 703                 /*
 704                  *      Build fragment header.
 705                  */
 706                 fh->nexthdr = nexthdr;
 707                 fh->reserved = 0;
 708                 if (frag_id) {
 709                         ipv6_select_ident(skb, fh);
 710                         frag_id = fh->identification;
 711                 } else
 712                         fh->identification = frag_id;
 713
 714                 /*
 715                  *      Copy a block of the IP datagram.
 716                  */
 717                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
 718                         BUG();
 719                 left -= len;
 720
 721                 fh->frag_off = htons(offset);
 722                 if (left > 0)
 723                         fh->frag_off |= htons(IP6_MF);
 724                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 725
 726                 ptr += len;
 727                 offset += len;
 728
 729                 /*
 730                  *      Put this fragment into the sending queue.
 731                  */
 732
 733                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 734
 735                 err = output(frag);
 736                 if (err)
 737                         goto fail;
 738         }
 739         kfree_skb(skb);
 740         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
 741         return err;
 742
 743 fail:
 744         kfree_skb(skb);
 745         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 746         return err;
 747 }
 748
 749 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 750 {
 751         int err = 0;
 752
 753         *dst = NULL;
 754         if (sk) {
 755                 struct ipv6_pinfo *np = inet6_sk(sk);
 756
 757                 *dst = sk_dst_check(sk, np->dst_cookie);
 758                 if (*dst) {
 759                         struct rt6_info *rt = (struct rt6_info*)*dst;
 760
 761                                 /* Yes, checking route validity in not connected
 762                                    case is not very simple. Take into account,
 763                                    that we do not support routing by source, TOS,
 764                                    and MSG_DONTROUTE            --ANK (980726)
 765
 766                                    1. If route was host route, check that
 767                                       cached destination is current.
 768                                       If it is network route, we still may
 769                                       check its validity using saved pointer
 770                                       to the last used address: daddr_cache.
 771                                       We do not want to save whole address now,
 772                                       (because main consumer of this service
 773                                        is tcp, which has not this problem),
 774                                       so that the last trick works only on connected
 775                                       sockets.
 776                                    2. oif also should be the same.
 777                                  */
 778
 779                         if (((rt->rt6i_dst.plen != 128 ||
 780                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
 781                              && (np->daddr_cache == NULL ||
 782                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
 783                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
 784                                 dst_release(*dst);
 785                                 *dst = NULL;
 786                         }
 787                 }
 788         }
 789
 790         if (*dst == NULL)
 791                 *dst = ip6_route_output(sk, fl);
 792
 793         if ((err = (*dst)->error))
 794                 goto out_err_release;
 795
 796         if (ipv6_addr_any(&fl->fl6_src)) {
 797                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 798
 799                 if (err) {
 800 #if IP6_DEBUG >= 2
 801                         printk(KERN_DEBUG "ip6_dst_lookup: "
 802                                "no available source address\n");
 803 #endif
 804                         goto out_err_release;
 805                 }
 806         }
 807
 808         return 0;
 809
 810 out_err_release:
 811         dst_release(*dst);
 812         *dst = NULL;
 813         return err;
 814 }
 815
 /*
  * ip6_append_data - queue more payload on sk->sk_write_queue for a later push.
  *
  * @sk:          socket whose write queue and cork state are used
  * @getfrag:     callback that fills a buffer; it is invoked as
  *               getfrag(from, to, offset, len, odd, skb) and a negative
  *               return value is reported to our caller as -EFAULT
  * @from:        opaque cookie handed unchanged to getfrag
  * @length:      number of bytes to append (exthdrlen is added to it on
  *               the first call)
  * @transhdrlen: transport header length; that many bytes at the start of
  *               the first skb's payload are skipped, i.e. not filled via
  *               getfrag.  Forced to 0 when the queue is already non-empty.
  * @hlimit:      hop limit, saved in np->cork.hop_limit on the first call
  * @opt:         tx options (may be NULL); copied into np->cork.opt on the
  *               first call
  * @fl, @rt:     flow and route; recorded in the cork on the first call and
  *               replaced by the corked values on later calls
  * @flags:       MSG_PROBE, MSG_MORE and MSG_DONTWAIT are examined
  *
  * "First call" above means that the write queue is empty on entry.
  *
  * Returns 0 on success (and immediately for MSG_PROBE), otherwise a
  * negative errno.  The "error" exit only corrects inet->cork.length and
  * bumps the OUTDISCARDS counter; skbs that were already queued and the
  * cork state are left in place.
  * NOTE(review): presumably callers flush the queue after a failure --
  * confirm against the call sites.
  */
 816 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
 817                     void *from, int length, int transhdrlen,
 818                     int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
 819                     unsigned int flags)
 820 {
 821         struct inet_sock *inet = inet_sk(sk);
 822         struct ipv6_pinfo *np = inet6_sk(sk);
 823         struct sk_buff *skb;
 824         unsigned int maxfraglen, fragheaderlen;
 825         int exthdrlen;
 826         int hh_len;
 827         int mtu;
 828         int copy;
 829         int err;
 830         int offset = 0;
 831         int csummode = CHECKSUM_NONE;
 832
             /* MSG_PROBE: report success without queueing anything. */
 833         if (flags&MSG_PROBE)
 834                 return 0;
 835         if (skb_queue_empty(&sk->sk_write_queue)) {
 836                 /*
 837                  * setup for corking
 838                  */
 839                 if (opt) {
                             /*
                              * Allocate np->cork.opt if there is none; an
                              * existing buffer is reused only when it is
                              * at least opt->tot_len bytes long.
                              */
 840                         if (np->cork.opt == NULL) {
 841                                 np->cork.opt = kmalloc(opt->tot_len,
 842                                                        sk->sk_allocation);
 843                                 if (unlikely(np->cork.opt == NULL))
 844                                         return -ENOBUFS;
 845                         } else if (np->cork.opt->tot_len < opt->tot_len) {
 846                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
 847                                 return -EINVAL;
 848                         }
 849                         memcpy(np->cork.opt, opt, opt->tot_len);
 850                         inet->cork.flags |= IPCORK_OPT;
 851                         /* need source address above miyazawa*/
 852                 }
                     /* The cork keeps its own reference on the route. */
 853                 dst_hold(&rt->u.dst);
 854                 np->cork.rt = rt;
 855                 inet->cork.fl = *fl;
 856                 np->cork.hop_limit = hlimit;
 857                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 858                 if (dst_allfrag(rt->u.dst.path))
 859                         inet->cork.flags |= IPCORK_ALLFRAG;
 860                 inet->cork.length = 0;
 861                 sk->sk_sndmsg_page = NULL;
 862                 sk->sk_sndmsg_off = 0;
                     /*
                      * exthdrlen is the route's header_len plus
                      * opt->opt_flen; it is accounted as part of both the
                      * data length and the skipped header area of the
                      * first skb.
                      */
 863                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
 864                 length += exthdrlen;
 865                 transhdrlen += exthdrlen;
 866         } else {
                     /*
                      * Data is already pending: use the route, flow,
                      * options and MTU recorded in the cork and ignore
                      * the ones passed in.
                      */
 867                 rt = np->cork.rt;
 868                 fl = &inet->cork.fl;
 869                 if (inet->cork.flags & IPCORK_OPT)
 870                         opt = np->cork.opt;
 871                 transhdrlen = 0;
 872                 exthdrlen = 0;
 873                 mtu = inet->cork.fragsize;
 874         }
 875
 876         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 877
             /*
              * fragheaderlen: IPv6 header plus opt->opt_nflen.
              * maxfraglen:    (mtu - fragheaderlen) rounded down to a
              *                multiple of 8, plus fragheaderlen, minus the
              *                size of a fragment header.
              */
 878         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
 879         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 880
             /*
              * Refuse a message whose accumulated length would exceed
              * sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen.
              * The check is only made when the MTU itself does not exceed
              * sizeof(struct ipv6hdr) + IPV6_MAXPLEN.
              */
 881         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
 882                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
 883                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
 884                         return -EMSGSIZE;
 885                 }
 886         }
 887
 888         /*
 889          * Let's try using as much space as possible.
 890          * Use MTU if total length of the message fits into the MTU.
 891          * Otherwise, we need to reserve fragment header and
 892          * fragment alignment (= 8-15 octets, in total).
 893          *
 894          * Note that we may need to "move" the data from the tail of
 895          * the buffer to the new fragment when we split
 896          * the message.
 897          *
 898          * FIXME: It may be fragmented into multiple chunks
 899          *        at once if non-fragmentable extension headers
 900          *        are too large.
 901          * --yoshfuji
 902          */
 903
 904         inet->cork.length += length;
 905
             /* Empty queue: jump straight into the allocation code. */
 906         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 907                 goto alloc_new_skb;
 908
 909         while (length > 0) {
 910                 /* Check if the remaining data fits into current packet. */
 911                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
 912                 if (copy < length)
 913                         copy = maxfraglen - skb->len;
 914
 915                 if (copy <= 0) {
 916                         char *data;
 917                         unsigned int datalen;
 918                         unsigned int fraglen;
 919                         unsigned int fraggap;
 920                         unsigned int alloclen;
 921                         struct sk_buff *skb_prev;
 922 alloc_new_skb:
 923                         skb_prev = skb;
 924
 925                         /* There's no room in the current skb */
                             /*
                              * fraggap is the number of bytes by which the
                              * previous skb exceeds maxfraglen; they are
                              * moved into the new skb further down.
                              */
 926                         if (skb_prev)
 927                                 fraggap = skb_prev->len - maxfraglen;
 928                         else
 929                                 fraggap = 0;
 930
 931                         /*
 932                          * If remaining data exceeds the mtu,
 933                          * we know we need more fragment(s).
 934                          */
 935                         datalen = length + fraggap;
 936                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
 937                                 datalen = maxfraglen - fragheaderlen;
 938
 939                         fraglen = datalen + fragheaderlen;
                             /*
                              * With MSG_MORE on a device lacking
                              * NETIF_F_SG, allocate a full MTU so that
                              * later calls can append into this skb.
                              */
 940                         if ((flags & MSG_MORE) &&
 941                             !(rt->u.dst.dev->features&NETIF_F_SG))
 942                                 alloclen = mtu;
 943                         else
 944                                 alloclen = datalen + fragheaderlen;
 945
 946                         /*
 947                          * The last fragment gets additional space at tail.
 948                          * Note: we overallocate on fragments with MSG_MORE
 949                          * because we have no idea if we're the last one.
 950                          */
 951                         if (datalen == length + fraggap)
 952                                 alloclen += rt->u.dst.trailer_len;
 953
 954                         /*
 955                          * We just reserve space for fragment header.
 956                          * Note: this may be overallocation if the message
 957                          * (without MSG_MORE) fits into the MTU.
 958                          */
 959                         alloclen += sizeof(struct frag_hdr);
 960
                             /*
                              * Only an skb that carries the transport
                              * header (transhdrlen != 0) goes through
                              * sock_alloc_send_skb(), with MSG_DONTWAIT
                              * passed through.  Later skbs use
                              * sock_wmalloc() and are refused with
                              * -ENOBUFS once sk_wmem_alloc exceeds twice
                              * sk_sndbuf.
                              */
 961                         if (transhdrlen) {
 962                                 skb = sock_alloc_send_skb(sk,
 963                                                 alloclen + hh_len,
 964                                                 (flags & MSG_DONTWAIT), &err);
 965                         } else {
 966                                 skb = NULL;
 967                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 968                                     2 * sk->sk_sndbuf)
 969                                         skb = sock_wmalloc(sk,
 970                                                            alloclen + hh_len, 1,
 971                                                            sk->sk_allocation);
 972                                 if (unlikely(skb == NULL))
 973                                         err = -ENOBUFS;
 974                         }
 975                         if (skb == NULL)
 976                                 goto error;
 977                         /*
 978                          *      Fill in the control structures
 979                          */
 980                         skb->ip_summed = csummode;
 981                         skb->csum = 0;
 982                         /* reserve for fragmentation */
 983                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
 984
 985                         /*
 986                          *      Find where to start putting bytes
 987                          */
                             /*
                              * nh.raw is set exthdrlen bytes into the newly
                              * put area and h.raw a further fragheaderlen
                              * bytes on; "data" ends up just past
                              * fragheaderlen.
                              */
 988                         data = skb_put(skb, fraglen);
 989                         skb->nh.raw = data + exthdrlen;
 990                         data += fragheaderlen;
 991                         skb->h.raw = data + exthdrlen;
 992
                             /*
                              * Move the overhang of skb_prev into this skb,
                              * take its checksum contribution out of
                              * skb_prev->csum, and cut skb_prev back to
                              * maxfraglen.
                              */
 993                         if (fraggap) {
 994                                 skb->csum = skb_copy_and_csum_bits(
 995                                         skb_prev, maxfraglen,
 996                                         data + transhdrlen, fraggap, 0);
 997                                 skb_prev->csum = csum_sub(skb_prev->csum,
 998                                                           skb->csum);
 999                                 data += fraggap;
1000                                 skb_trim(skb_prev, maxfraglen);
1001                         }
                             /*
                              * Bytes still to be fetched from the caller
                              * for this skb; the first transhdrlen bytes
                              * are skipped rather than filled here.
                              */
1002                         copy = datalen - transhdrlen - fraggap;
1003                         if (copy < 0) {
1004                                 err = -EINVAL;
1005                                 kfree_skb(skb);
1006                                 goto error;
1007                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1008                                 err = -EFAULT;
1009                                 kfree_skb(skb);
1010                                 goto error;
1011                         }
1012
1013                         offset += copy;
1014                         length -= datalen - fraggap;
                             /* Header space applies to the first skb only. */
1015                         transhdrlen = 0;
1016                         exthdrlen = 0;
1017                         csummode = CHECKSUM_NONE;
1018
1019                         /*
1020                          * Put the packet on the pending queue
1021                          */
1022                         __skb_queue_tail(&sk->sk_write_queue, skb);
1023                         continue;
1024                 }
1025
                     /* There is room in the tail skb: append up to "copy" bytes. */
1026                 if (copy > length)
1027                         copy = length;
1028
1029                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1030                         unsigned int off;
1031
                             /*
                              * No NETIF_F_SG: copy straight into the skb
                              * tail and undo the skb_put() if getfrag
                              * fails.
                              */
1032                         off = skb->len;
1033                         if (getfrag(from, skb_put(skb, copy),
1034                                                 offset, copy, off, skb) < 0) {
1035                                 __skb_trim(skb, off);
1036                                 err = -EFAULT;
1037                                 goto error;
1038                         }
1039                 } else {
                             /*
                              * NETIF_F_SG: store the data in page fragments.
                              * sk->sk_sndmsg_page / sk->sk_sndmsg_off record
                              * the page currently being filled and how far
                              * it has been filled.
                              *
                              * NOTE(review): when nr_frags is 0, "frag"
                              * points one element before frags[] and
                              * frag->page is read below if sk_sndmsg_page is
                              * set -- confirm this cannot misbehave.
                              */
1040                         int i = skb_shinfo(skb)->nr_frags;
1041                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1042                         struct page *page = sk->sk_sndmsg_page;
1043                         int off = sk->sk_sndmsg_off;
1044                         unsigned int left;
1045
1046                         if (page && (left = PAGE_SIZE - off) > 0) {
                                     /*
                                      * Room left in the remembered page.  If
                                      * it is not this skb's last fragment,
                                      * attach it as a new fragment with its
                                      * own page reference.
                                      */
1047                                 if (copy >= left)
1048                                         copy = left;
1049                                 if (page != frag->page) {
1050                                         if (i == MAX_SKB_FRAGS) {
1051                                                 err = -EMSGSIZE;
1052                                                 goto error;
1053                                         }
1054                                         get_page(page);
1055                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1056                                         frag = &skb_shinfo(skb)->frags[i];
1057                                 }
1058                         } else if(i < MAX_SKB_FRAGS) {
                                     /*
                                      * No usable page: allocate a fresh one,
                                      * remember it on the socket and charge
                                      * PAGE_SIZE to the skb and the socket.
                                      */
1059                                 if (copy > PAGE_SIZE)
1060                                         copy = PAGE_SIZE;
1061                                 page = alloc_pages(sk->sk_allocation, 0);
1062                                 if (page == NULL) {
1063                                         err = -ENOMEM;
1064                                         goto error;
1065                                 }
1066                                 sk->sk_sndmsg_page = page;
1067                                 sk->sk_sndmsg_off = 0;
1068
1069                                 skb_fill_page_desc(skb, i, page, 0, 0);
1070                                 frag = &skb_shinfo(skb)->frags[i];
1071                                 skb->truesize += PAGE_SIZE;
1072                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1073                         } else {
1074                                 err = -EMSGSIZE;
1075                                 goto error;
1076                         }
                             /* Fill the fragment right after the bytes it already holds. */
1077                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1078                                 err = -EFAULT;
1079                                 goto error;
1080                         }
1081                         sk->sk_sndmsg_off += copy;
1082                         frag->size += copy;
1083                         skb->len += copy;
1084                         skb->data_len += copy;
1085                 }
1086                 offset += copy;
1087                 length -= copy;
1088         }
1089         return 0;
1090 error:
             /*
              * "length" is what remains unqueued; take it back out of the
              * running total that was bumped before the loop.
              */
1091         inet->cork.length -= length;
1092         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1093         return err;
1094 }
1095
1096 int ip6_push_pending_frames(struct sock *sk)
1097 {
1098         struct sk_buff *skb, *tmp_skb;
1099         struct sk_buff **tail_skb;
1100         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1101         struct inet_sock *inet = inet_sk(sk);
1102         struct ipv6_pinfo *np = inet6_sk(sk);
1103         struct ipv6hdr *hdr;
1104         struct ipv6_txoptions *opt = np->cork.opt;
1105         struct rt6_info *rt = np->cork.rt;
1106         struct flowi *fl = &inet->cork.fl;
1107         unsigned char proto = fl->proto;
1108         int err = 0;
1109
1110         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1111                 goto out;
1112         tail_skb = &(skb_shinfo(skb)->frag_list);
1113
1114         /* move skb->data to ip header from ext header */
1115         if (skb->data < skb->nh.raw)
1116                 __skb_pull(skb, skb->nh.raw - skb->data);
1117         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1118                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1119                 *tail_skb = tmp_skb;
1120                 tail_skb = &(tmp_skb->next);
1121                 skb->len += tmp_skb->len;
1122                 skb->data_len += tmp_skb->len;
1123                 skb->truesize += tmp_skb->truesize;
1124                 __sock_put(tmp_skb->sk);
1125                 tmp_skb->destructor = NULL;
1126                 tmp_skb->sk = NULL;
1127         }
1128
1129         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1130         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1131         if (opt && opt->opt_flen)
1132                 ipv6_push_frag_opts(skb, opt, &proto);
1133         if (opt && opt->opt_nflen)
1134                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1135
1136         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1137
1138         *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1139
1140         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1141                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1142         else
1143                 hdr->payload_len = 0;
1144         hdr->hop_limit = np->cork.hop_limit;
1145         hdr->nexthdr = proto;
1146         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1147         ipv6_addr_copy(&hdr->daddr, final_dst);
1148
1149         skb->dst = dst_clone(&rt->u.dst);
1150         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1151         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1152         if (err) {
1153                 if (err > 0)
1154                         err = np->recverr ? net_xmit_errno(err) : 0;
1155                 if (err)
1156                         goto error;
1157         }
1158
1159 out:
1160         inet->cork.flags &= ~IPCORK_OPT;
1161         if (np->cork.opt) {
1162                 kfree(np->cork.opt);
1163                 np->cork.opt = NULL;
1164         }
1165         if (np->cork.rt) {
1166                 dst_release(&np->cork.rt->u.dst);
1167                 np->cork.rt = NULL;
1168                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1169         }
1170         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1171         return err;
1172 error:
1173         goto out;
1174 }
1175
1176 void ip6_flush_pending_frames(struct sock *sk)
1177 {
1178         struct inet_sock *inet = inet_sk(sk);
1179         struct ipv6_pinfo *np = inet6_sk(sk);
1180         struct sk_buff *skb;
1181
1182         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1183                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1184                 kfree_skb(skb);
1185         }
1186
1187         inet->cork.flags &= ~IPCORK_OPT;
1188
1189         if (np->cork.opt) {
1190                 kfree(np->cork.opt);
1191                 np->cork.opt = NULL;
1192         }
1193         if (np->cork.rt) {
1194                 dst_release(&np->cork.rt->u.dst);
1195                 np->cork.rt = NULL;
1196                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1197         }
1198         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1199 }