net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *      Based on linux/net/ipv4/ip_output.c
  11  *
  12  *      This program is free software; you can redistribute it and/or
  13  *      modify it under the terms of the GNU General Public License
  14  *      as published by the Free Software Foundation; either version
  15  *      2 of the License, or (at your option) any later version.
  16  *
  17  *      Changes:
  18  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  19  *                              extension headers are implemented.
  20  *                              route changes now work.
  21  *                              ip6_forward does not confuse sniffers.
  22  *                              etc.
  23  *
  24  *      H. von Brand    :       Added missing #include <linux/string.h>
  25  *      Imran Patel     :       frag id should be in NBO
  26  *      Kazunori MIYAZAWA @USAGI
  27  *                      :       add ip6_append_data and related functions
  28  *                              for datagram xmit
  29  */
  30
  31 #include <linux/config.h>
  32 #include <linux/errno.h>
  33 #include <linux/types.h>
  34 #include <linux/string.h>
  35 #include <linux/socket.h>
  36 #include <linux/net.h>
  37 #include <linux/netdevice.h>
  38 #include <linux/if_arp.h>
  39 #include <linux/in6.h>
  40 #include <linux/tcp.h>
  41 #include <linux/route.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  62 {
  63         static u32 ipv6_fragmentation_id = 1;
  64         static DEFINE_SPINLOCK(ip6_id_lock);
  65
  66         spin_lock_bh(&ip6_id_lock);
  67         fhdr->identification = htonl(ipv6_fragmentation_id);
  68         if (++ipv6_fragmentation_id == 0)
  69                 ipv6_fragmentation_id = 1;
  70         spin_unlock_bh(&ip6_id_lock);
  71 }
  72
  73 static inline int ip6_output_finish(struct sk_buff *skb)
  74 {
  75
  76         struct dst_entry *dst = skb->dst;
  77         struct hh_cache *hh = dst->hh;
  78
  79         if (hh) {
  80                 int hh_alen;
  81
  82                 read_lock_bh(&hh->hh_lock);
  83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
  84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
  85                 read_unlock_bh(&hh->hh_lock);
  86                 skb_push(skb, hh->hh_len);
  87                 return hh->hh_output(skb);
  88         } else if (dst->neighbour)
  89                 return dst->neighbour->output(skb);
  90
  91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
  92         kfree_skb(skb);
  93         return -EINVAL;
  94
  95 }
  96
  97 /* dev_loopback_xmit for use with netfilter. */
  98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  99 {
 100         newskb->mac.raw = newskb->data;
 101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 102         newskb->pkt_type = PACKET_LOOPBACK;
 103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 104         BUG_TRAP(newskb->dst);
 105
 106         netif_rx(newskb);
 107         return 0;
 108 }
 109
 110
 111 static int ip6_output2(struct sk_buff *skb)
 112 {
 113         struct dst_entry *dst = skb->dst;
 114         struct net_device *dev = dst->dev;
 115
 116         skb->protocol = htons(ETH_P_IPV6);
 117         skb->dev = dev;
 118
 119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
 120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 121
 122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
 124                                 &skb->nh.ipv6h->saddr)) {
 125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 126
 127                         /* Do not check for IFF_ALLMULTI; multicast routing
 128                            is not supported in any case.
 129                          */
 130                         if (newskb)
 131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
 132                                         newskb->dev,
 133                                         ip6_dev_loopback_xmit);
 134
 135                         if (skb->nh.ipv6h->hop_limit == 0) {
 136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 137                                 kfree_skb(skb);
 138                                 return 0;
 139                         }
 140                 }
 141
 142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 143         }
 144
 145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 146 }
 147
 148 int ip6_output(struct sk_buff *skb)
 149 {
 150         if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
 151                 return ip6_fragment(skb, ip6_output2);
 152         else
 153                 return ip6_output2(skb);
 154 }
 155
 156 #ifdef CONFIG_NETFILTER
 157 int ip6_route_me_harder(struct sk_buff *skb)
 158 {
 159         struct ipv6hdr *iph = skb->nh.ipv6h;
 160         struct dst_entry *dst;
 161         struct flowi fl = {
 162                 .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
 163                 .nl_u =
 164                 { .ip6_u =
 165                   { .daddr = iph->daddr,
 166                     .saddr = iph->saddr, } },
 167                 .proto = iph->nexthdr,
 168         };
 169
 170         dst = ip6_route_output(skb->sk, &fl);
 171
 172         if (dst->error) {
 173                 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 174                 LIMIT_NETDEBUG(
 175                         printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
 176                 dst_release(dst);
 177                 return -EINVAL;
 178         }
 179
 180         /* Drop old route. */
 181         dst_release(skb->dst);
 182
 183         skb->dst = dst;
 184         return 0;
 185 }
 186 #endif
 187
 188 static inline int ip6_maybe_reroute(struct sk_buff *skb)
 189 {
 190 #ifdef CONFIG_NETFILTER
 191         if (skb->nfcache & NFC_ALTERED){
 192                 if (ip6_route_me_harder(skb) != 0){
 193                         kfree_skb(skb);
 194                         return -EINVAL;
 195                 }
 196         }
 197 #endif /* CONFIG_NETFILTER */
 198         return dst_output(skb);
 199 }
 200
 201 /*
 202  *      xmit an sk_buff (used by TCP)
 203  */
 204
 205 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 206              struct ipv6_txoptions *opt, int ipfragok)
 207 {
 208         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
 209         struct in6_addr *first_hop = &fl->fl6_dst;
 210         struct dst_entry *dst = skb->dst;
 211         struct ipv6hdr *hdr;
 212         u8  proto = fl->proto;
 213         int seg_len = skb->len;
 214         int hlimit;
 215         u32 mtu;
 216
 217         if (opt) {
 218                 int head_room;
 219
 220                 /* First: exthdrs may take lots of space (~8K for now)
 221                    MAX_HEADER is not enough.
 222                  */
 223                 head_room = opt->opt_nflen + opt->opt_flen;
 224                 seg_len += head_room;
 225                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 226
 227                 if (skb_headroom(skb) < head_room) {
 228                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 229                         kfree_skb(skb);
 230                         skb = skb2;
 231                         if (skb == NULL) {
 232                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 233                                 return -ENOBUFS;
 234                         }
 235                         if (sk)
 236                                 skb_set_owner_w(skb, sk);
 237                 }
 238                 if (opt->opt_flen)
 239                         ipv6_push_frag_opts(skb, opt, &proto);
 240                 if (opt->opt_nflen)
 241                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 242         }
 243
 244         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
 245
 246         /*
 247          *      Fill in the IPv6 header
 248          */
 249
 250         *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
 251         hlimit = -1;
 252         if (np)
 253                 hlimit = np->hop_limit;
 254         if (hlimit < 0)
 255                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
 256         if (hlimit < 0)
 257                 hlimit = ipv6_get_hoplimit(dst->dev);
 258
 259         hdr->payload_len = htons(seg_len);
 260         hdr->nexthdr = proto;
 261         hdr->hop_limit = hlimit;
 262
 263         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 264         ipv6_addr_copy(&hdr->daddr, first_hop);
 265
 266         mtu = dst_mtu(dst);
 267         if ((skb->len <= mtu) || ipfragok) {
 268                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 269                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
 270         }
 271
 272         if (net_ratelimit())
 273                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 274         skb->dev = dst->dev;
 275         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 276         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 277         kfree_skb(skb);
 278         return -EMSGSIZE;
 279 }
 280
 281 /*
 282  *      To avoid extra problems ND packets are send through this
 283  *      routine. It's code duplication but I really want to avoid
 284  *      extra checks since ipv6_build_header is used by TCP (which
 285  *      is for us performance critical)
 286  */
 287
 288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 289                struct in6_addr *saddr, struct in6_addr *daddr,
 290                int proto, int len)
 291 {
 292         struct ipv6_pinfo *np = inet6_sk(sk);
 293         struct ipv6hdr *hdr;
 294         int totlen;
 295
 296         skb->protocol = htons(ETH_P_IPV6);
 297         skb->dev = dev;
 298
 299         totlen = len + sizeof(struct ipv6hdr);
 300
 301         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
 302         skb->nh.ipv6h = hdr;
 303
 304         *(u32*)hdr = htonl(0x60000000);
 305
 306         hdr->payload_len = htons(len);
 307         hdr->nexthdr = proto;
 308         hdr->hop_limit = np->hop_limit;
 309
 310         ipv6_addr_copy(&hdr->saddr, saddr);
 311         ipv6_addr_copy(&hdr->daddr, daddr);
 312
 313         return 0;
 314 }
 315
 316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 317 {
 318         struct ip6_ra_chain *ra;
 319         struct sock *last = NULL;
 320
 321         read_lock(&ip6_ra_lock);
 322         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 323                 struct sock *sk = ra->sk;
 324                 if (sk && ra->sel == sel) {
 325                         if (last) {
 326                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 327                                 if (skb2)
 328                                         rawv6_rcv(last, skb2);
 329                         }
 330                         last = sk;
 331                 }
 332         }
 333
 334         if (last) {
 335                 rawv6_rcv(last, skb);
 336                 read_unlock(&ip6_ra_lock);
 337                 return 1;
 338         }
 339         read_unlock(&ip6_ra_lock);
 340         return 0;
 341 }
 342
 343 static inline int ip6_forward_finish(struct sk_buff *skb)
 344 {
 345         return dst_output(skb);
 346 }
 347
 348 int ip6_forward(struct sk_buff *skb)
 349 {
 350         struct dst_entry *dst = skb->dst;
 351         struct ipv6hdr *hdr = skb->nh.ipv6h;
 352         struct inet6_skb_parm *opt = IP6CB(skb);
 353
 354         if (ipv6_devconf.forwarding == 0)
 355                 goto error;
 356
 357         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 358                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
 359                 goto drop;
 360         }
 361
 362         skb->ip_summed = CHECKSUM_NONE;
 363
 364         /*
 365          *      We DO NOT make any processing on
 366          *      RA packets, pushing them to user level AS IS
 367          *      without ane WARRANTY that application will be able
 368          *      to interpret them. The reason is that we
 369          *      cannot make anything clever here.
 370          *
 371          *      We are not end-node, so that if packet contains
 372          *      AH/ESP, we cannot make anything.
 373          *      Defragmentation also would be mistake, RA packets
 374          *      cannot be fragmented, because there is no warranty
 375          *      that different fragments will go along one path. --ANK
 376          */
 377         if (opt->ra) {
 378                 u8 *ptr = skb->nh.raw + opt->ra;
 379                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 380                         return 0;
 381         }
 382
 383         /*
 384          *      check and decrement ttl
 385          */
 386         if (hdr->hop_limit <= 1) {
 387                 /* Force OUTPUT device used as source address */
 388                 skb->dev = dst->dev;
 389                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 390                             0, skb->dev);
 391
 392                 kfree_skb(skb);
 393                 return -ETIMEDOUT;
 394         }
 395
 396         if (!xfrm6_route_forward(skb)) {
 397                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
 398                 goto drop;
 399         }
 400         dst = skb->dst;
 401
 402         /* IPv6 specs say nothing about it, but it is clear that we cannot
 403            send redirects to source routed frames.
 404          */
 405         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
 406                 struct in6_addr *target = NULL;
 407                 struct rt6_info *rt;
 408                 struct neighbour *n = dst->neighbour;
 409
 410                 /*
 411                  *      incoming and outgoing devices are the same
 412                  *      send a redirect.
 413                  */
 414
 415                 rt = (struct rt6_info *) dst;
 416                 if ((rt->rt6i_flags & RTF_GATEWAY))
 417                         target = (struct in6_addr*)&n->primary_key;
 418                 else
 419                         target = &hdr->daddr;
 420
 421                 /* Limit redirects both by destination (here)
 422                    and by source (inside ndisc_send_redirect)
 423                  */
 424                 if (xrlim_allow(dst, 1*HZ))
 425                         ndisc_send_redirect(skb, n, target);
 426         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
 427                                                 |IPV6_ADDR_LINKLOCAL)) {
 428                 /* This check is security critical. */
 429                 goto error;
 430         }
 431
 432         if (skb->len > dst_mtu(dst)) {
 433                 /* Again, force OUTPUT device used as source address */
 434                 skb->dev = dst->dev;
 435                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 436                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
 437                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
 438                 kfree_skb(skb);
 439                 return -EMSGSIZE;
 440         }
 441
 442         if (skb_cow(skb, dst->dev->hard_header_len)) {
 443                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 444                 goto drop;
 445         }
 446
 447         hdr = skb->nh.ipv6h;
 448
 449         /* Mangling hops number delayed to point after skb COW */
 450
 451         hdr->hop_limit--;
 452
 453         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
 454         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
 455
 456 error:
 457         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 458 drop:
 459         kfree_skb(skb);
 460         return -EINVAL;
 461 }
 462
 463 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 464 {
 465         to->pkt_type = from->pkt_type;
 466         to->priority = from->priority;
 467         to->protocol = from->protocol;
 468         to->security = from->security;
 469         dst_release(to->dst);
 470         to->dst = dst_clone(from->dst);
 471         to->dev = from->dev;
 472
 473 #ifdef CONFIG_NET_SCHED
 474         to->tc_index = from->tc_index;
 475 #endif
 476 #ifdef CONFIG_NETFILTER
 477         to->nfmark = from->nfmark;
 478         /* Connection association is same as pre-frag packet */
 479         to->nfct = from->nfct;
 480         nf_conntrack_get(to->nfct);
 481         to->nfctinfo = from->nfctinfo;
 482 #ifdef CONFIG_BRIDGE_NETFILTER
 483         nf_bridge_put(to->nf_bridge);
 484         to->nf_bridge = from->nf_bridge;
 485         nf_bridge_get(to->nf_bridge);
 486 #endif
 487 #endif
 488 }
 489
 490 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 491 {
 492         u16 offset = sizeof(struct ipv6hdr);
 493         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
 494         unsigned int packet_len = skb->tail - skb->nh.raw;
 495         int found_rhdr = 0;
 496         *nexthdr = &skb->nh.ipv6h->nexthdr;
 497
 498         while (offset + 1 <= packet_len) {
 499
 500                 switch (**nexthdr) {
 501
 502                 case NEXTHDR_HOP:
 503                 case NEXTHDR_ROUTING:
 504                 case NEXTHDR_DEST:
 505                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
 506                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
 507                         offset += ipv6_optlen(exthdr);
 508                         *nexthdr = &exthdr->nexthdr;
 509                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
 510                         break;
 511                 default :
 512                         return offset;
 513                 }
 514         }
 515
 516         return offset;
 517 }
 518
 519 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 520 {
 521         struct net_device *dev;
 522         struct sk_buff *frag;
 523         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 524         struct ipv6hdr *tmp_hdr;
 525         struct frag_hdr *fh;
 526         unsigned int mtu, hlen, left, len;
 527         u32 frag_id = 0;
 528         int ptr, offset = 0, err=0;
 529         u8 *prevhdr, nexthdr = 0;
 530
 531         dev = rt->u.dst.dev;
 532         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 533         nexthdr = *prevhdr;
 534
 535         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
 536
 537         if (skb_shinfo(skb)->frag_list) {
 538                 int first_len = skb_pagelen(skb);
 539
 540                 if (first_len - hlen > mtu ||
 541                     ((first_len - hlen) & 7) ||
 542                     skb_cloned(skb))
 543                         goto slow_path;
 544
 545                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 546                         /* Correct geometry. */
 547                         if (frag->len > mtu ||
 548                             ((frag->len & 7) && frag->next) ||
 549                             skb_headroom(frag) < hlen)
 550                             goto slow_path;
 551
 552                         /* Partially cloned skb? */
 553                         if (skb_shared(frag))
 554                                 goto slow_path;
 555
 556                         BUG_ON(frag->sk);
 557                         if (skb->sk) {
 558                                 sock_hold(skb->sk);
 559                                 frag->sk = skb->sk;
 560                                 frag->destructor = sock_wfree;
 561                                 skb->truesize -= frag->truesize;
 562                         }
 563                 }
 564
 565                 err = 0;
 566                 offset = 0;
 567                 frag = skb_shinfo(skb)->frag_list;
 568                 skb_shinfo(skb)->frag_list = NULL;
 569                 /* BUILD HEADER */
 570
 571                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
 572                 if (!tmp_hdr) {
 573                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 574                         return -ENOMEM;
 575                 }
 576
 577                 *prevhdr = NEXTHDR_FRAGMENT;
 578                 memcpy(tmp_hdr, skb->nh.raw, hlen);
 579                 __skb_pull(skb, hlen);
 580                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 581                 skb->nh.raw = __skb_push(skb, hlen);
 582                 memcpy(skb->nh.raw, tmp_hdr, hlen);
 583
 584                 ipv6_select_ident(skb, fh);
 585                 fh->nexthdr = nexthdr;
 586                 fh->reserved = 0;
 587                 fh->frag_off = htons(IP6_MF);
 588                 frag_id = fh->identification;
 589
 590                 first_len = skb_pagelen(skb);
 591                 skb->data_len = first_len - skb_headlen(skb);
 592                 skb->len = first_len;
 593                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 594
 595
 596                 for (;;) {
 597                         /* Prepare header of the next frame,
 598                          * before previous one went down. */
 599                         if (frag) {
 600                                 frag->ip_summed = CHECKSUM_NONE;
 601                                 frag->h.raw = frag->data;
 602                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 603                                 frag->nh.raw = __skb_push(frag, hlen);
 604                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
 605                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 606                                 fh->nexthdr = nexthdr;
 607                                 fh->reserved = 0;
 608                                 fh->frag_off = htons(offset);
 609                                 if (frag->next != NULL)
 610                                         fh->frag_off |= htons(IP6_MF);
 611                                 fh->identification = frag_id;
 612                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 613                                 ip6_copy_metadata(frag, skb);
 614                         }
 615
 616                         err = output(skb);
 617                         if (err || !frag)
 618                                 break;
 619
 620                         skb = frag;
 621                         frag = skb->next;
 622                         skb->next = NULL;
 623                 }
 624
 625                 if (tmp_hdr)
 626                         kfree(tmp_hdr);
 627
 628                 if (err == 0) {
 629                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
 630                         return 0;
 631                 }
 632
 633                 while (frag) {
 634                         skb = frag->next;
 635                         kfree_skb(frag);
 636                         frag = skb;
 637                 }
 638
 639                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 640                 return err;
 641         }
 642
 643 slow_path:
 644         left = skb->len - hlen;         /* Space per frame */
 645         ptr = hlen;                     /* Where to start from */
 646
 647         /*
 648          *      Fragment the datagram.
 649          */
 650
 651         *prevhdr = NEXTHDR_FRAGMENT;
 652
 653         /*
 654          *      Keep copying data until we run out.
 655          */
 656         while(left > 0) {
 657                 len = left;
 658                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 659                 if (len > mtu)
 660                         len = mtu;
 661                 /* IF: we are not sending upto and including the packet end
 662                    then align the next start on an eight byte boundary */
 663                 if (len < left) {
 664                         len &= ~7;
 665                 }
 666                 /*
 667                  *      Allocate buffer.
 668                  */
 669
 670                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 671                         NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
 672                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 673                         err = -ENOMEM;
 674                         goto fail;
 675                 }
 676
 677                 /*
 678                  *      Set up data on packet
 679                  */
 680
 681                 ip6_copy_metadata(frag, skb);
 682                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 683                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 684                 frag->nh.raw = frag->data;
 685                 fh = (struct frag_hdr*)(frag->data + hlen);
 686                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
 687
 688                 /*
 689                  *      Charge the memory for the fragment to any owner
 690                  *      it might possess
 691                  */
 692                 if (skb->sk)
 693                         skb_set_owner_w(frag, skb->sk);
 694
 695                 /*
 696                  *      Copy the packet header into the new buffer.
 697                  */
 698                 memcpy(frag->nh.raw, skb->data, hlen);
 699
 700                 /*
 701                  *      Build fragment header.
 702                  */
 703                 fh->nexthdr = nexthdr;
 704                 fh->reserved = 0;
 705                 if (frag_id) {
 706                         ipv6_select_ident(skb, fh);
 707                         frag_id = fh->identification;
 708                 } else
 709                         fh->identification = frag_id;
 710
 711                 /*
 712                  *      Copy a block of the IP datagram.
 713                  */
 714                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
 715                         BUG();
 716                 left -= len;
 717
 718                 fh->frag_off = htons(offset);
 719                 if (left > 0)
 720                         fh->frag_off |= htons(IP6_MF);
 721                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 722
 723                 ptr += len;
 724                 offset += len;
 725
 726                 /*
 727                  *      Put this fragment into the sending queue.
 728                  */
 729
 730                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 731
 732                 err = output(frag);
 733                 if (err)
 734                         goto fail;
 735         }
 736         kfree_skb(skb);
 737         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
 738         return err;
 739
 740 fail:
 741         kfree_skb(skb);
 742         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 743         return err;
 744 }
 745
 746 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 747 {
 748         int err = 0;
 749
 750         *dst = NULL;
 751         if (sk) {
 752                 struct ipv6_pinfo *np = inet6_sk(sk);
 753
 754                 *dst = sk_dst_check(sk, np->dst_cookie);
 755                 if (*dst) {
 756                         struct rt6_info *rt = (struct rt6_info*)*dst;
 757
 758                                 /* Yes, checking route validity in not connected
 759                                    case is not very simple. Take into account,
 760                                    that we do not support routing by source, TOS,
 761                                    and MSG_DONTROUTE            --ANK (980726)
 762
 763                                    1. If route was host route, check that
 764                                       cached destination is current.
 765                                       If it is network route, we still may
 766                                       check its validity using saved pointer
 767                                       to the last used address: daddr_cache.
 768                                       We do not want to save whole address now,
 769                                       (because main consumer of this service
 770                                        is tcp, which has not this problem),
 771                                       so that the last trick works only on connected
 772                                       sockets.
 773                                    2. oif also should be the same.
 774                                  */
 775
 776                         if (((rt->rt6i_dst.plen != 128 ||
 777                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
 778                              && (np->daddr_cache == NULL ||
 779                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
 780                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
 781                                 dst_release(*dst);
 782                                 *dst = NULL;
 783                         }
 784                 }
 785         }
 786
 787         if (*dst == NULL)
 788                 *dst = ip6_route_output(sk, fl);
 789
 790         if ((err = (*dst)->error))
 791                 goto out_err_release;
 792
 793         if (ipv6_addr_any(&fl->fl6_src)) {
 794                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 795
 796                 if (err) {
 797 #if IP6_DEBUG >= 2
 798                         printk(KERN_DEBUG "ip6_dst_lookup: "
 799                                "no available source address\n");
 800 #endif
 801                         goto out_err_release;
 802                 }
 803         }
 804
 805         return 0;
 806
 807 out_err_release:
 808         dst_release(*dst);
 809         *dst = NULL;
 810         return err;
 811 }
 812
 813 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
 814                     void *from, int length, int transhdrlen,
 815                     int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
 816                     unsigned int flags)
 817 {
 818         struct inet_sock *inet = inet_sk(sk);
 819         struct ipv6_pinfo *np = inet6_sk(sk);
 820         struct sk_buff *skb;
 821         unsigned int maxfraglen, fragheaderlen;
 822         int exthdrlen;
 823         int hh_len;
 824         int mtu;
 825         int copy;
 826         int err;
 827         int offset = 0;
 828         int csummode = CHECKSUM_NONE;
 829
 830         if (flags&MSG_PROBE)
 831                 return 0;
 832         if (skb_queue_empty(&sk->sk_write_queue)) {
 833                 /*
 834                  * setup for corking
 835                  */
 836                 if (opt) {
 837                         if (np->cork.opt == NULL) {
 838                                 np->cork.opt = kmalloc(opt->tot_len,
 839                                                        sk->sk_allocation);
 840                                 if (unlikely(np->cork.opt == NULL))
 841                                         return -ENOBUFS;
 842                         } else if (np->cork.opt->tot_len < opt->tot_len) {
 843                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
 844                                 return -EINVAL;
 845                         }
 846                         memcpy(np->cork.opt, opt, opt->tot_len);
 847                         inet->cork.flags |= IPCORK_OPT;
 848                         /* need source address above miyazawa*/
 849                 }
 850                 dst_hold(&rt->u.dst);
 851                 np->cork.rt = rt;
 852                 inet->cork.fl = *fl;
 853                 np->cork.hop_limit = hlimit;
 854                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 855                 if (dst_allfrag(rt->u.dst.path))
 856                         inet->cork.flags |= IPCORK_ALLFRAG;
 857                 inet->cork.length = 0;
 858                 sk->sk_sndmsg_page = NULL;
 859                 sk->sk_sndmsg_off = 0;
 860                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
 861                 length += exthdrlen;
 862                 transhdrlen += exthdrlen;
 863         } else {
 864                 rt = np->cork.rt;
 865                 fl = &inet->cork.fl;
 866                 if (inet->cork.flags & IPCORK_OPT)
 867                         opt = np->cork.opt;
 868                 transhdrlen = 0;
 869                 exthdrlen = 0;
 870                 mtu = inet->cork.fragsize;
 871         }
 872
 873         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 874
 875         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
 876         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 877
 878         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
 879                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
 880                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
 881                         return -EMSGSIZE;
 882                 }
 883         }
 884
 885         /*
 886          * Let's try using as much space as possible.
 887          * Use MTU if total length of the message fits into the MTU.
 888          * Otherwise, we need to reserve fragment header and
 889          * fragment alignment (= 8-15 octects, in total).
 890          *
 891          * Note that we may need to "move" the data from the tail of
 892          * of the buffer to the new fragment when we split
 893          * the message.
 894          *
 895          * FIXME: It may be fragmented into multiple chunks
 896          *        at once if non-fragmentable extension headers
 897          *        are too large.
 898          * --yoshfuji
 899          */
 900
 901         inet->cork.length += length;
 902
 903         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 904                 goto alloc_new_skb;
 905
 906         while (length > 0) {
 907                 /* Check if the remaining data fits into current packet. */
 908                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
 909                 if (copy < length)
 910                         copy = maxfraglen - skb->len;
 911
 912                 if (copy <= 0) {
 913                         char *data;
 914                         unsigned int datalen;
 915                         unsigned int fraglen;
 916                         unsigned int fraggap;
 917                         unsigned int alloclen;
 918                         struct sk_buff *skb_prev;
 919 alloc_new_skb:
 920                         skb_prev = skb;
 921
 922                         /* There's no room in the current skb */
 923                         if (skb_prev)
 924                                 fraggap = skb_prev->len - maxfraglen;
 925                         else
 926                                 fraggap = 0;
 927
 928                         /*
 929                          * If remaining data exceeds the mtu,
 930                          * we know we need more fragment(s).
 931                          */
 932                         datalen = length + fraggap;
 933                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
 934                                 datalen = maxfraglen - fragheaderlen;
 935
 936                         fraglen = datalen + fragheaderlen;
 937                         if ((flags & MSG_MORE) &&
 938                             !(rt->u.dst.dev->features&NETIF_F_SG))
 939                                 alloclen = mtu;
 940                         else
 941                                 alloclen = datalen + fragheaderlen;
 942
 943                         /*
 944                          * The last fragment gets additional space at tail.
 945                          * Note: we overallocate on fragments with MSG_MODE
 946                          * because we have no idea if we're the last one.
 947                          */
 948                         if (datalen == length + fraggap)
 949                                 alloclen += rt->u.dst.trailer_len;
 950
 951                         /*
 952                          * We just reserve space for fragment header.
 953                          * Note: this may be overallocation if the message
 954                          * (without MSG_MORE) fits into the MTU.
 955                          */
 956                         alloclen += sizeof(struct frag_hdr);
 957
 958                         if (transhdrlen) {
 959                                 skb = sock_alloc_send_skb(sk,
 960                                                 alloclen + hh_len,
 961                                                 (flags & MSG_DONTWAIT), &err);
 962                         } else {
 963                                 skb = NULL;
 964                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 965                                     2 * sk->sk_sndbuf)
 966                                         skb = sock_wmalloc(sk,
 967                                                            alloclen + hh_len, 1,
 968                                                            sk->sk_allocation);
 969                                 if (unlikely(skb == NULL))
 970                                         err = -ENOBUFS;
 971                         }
 972                         if (skb == NULL)
 973                                 goto error;
 974                         /*
 975                          *      Fill in the control structures
 976                          */
 977                         skb->ip_summed = csummode;
 978                         skb->csum = 0;
 979                         /* reserve for fragmentation */
 980                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
 981
 982                         /*
 983                          *      Find where to start putting bytes
 984                          */
 985                         data = skb_put(skb, fraglen);
 986                         skb->nh.raw = data + exthdrlen;
 987                         data += fragheaderlen;
 988                         skb->h.raw = data + exthdrlen;
 989
 990                         if (fraggap) {
 991                                 skb->csum = skb_copy_and_csum_bits(
 992                                         skb_prev, maxfraglen,
 993                                         data + transhdrlen, fraggap, 0);
 994                                 skb_prev->csum = csum_sub(skb_prev->csum,
 995                                                           skb->csum);
 996                                 data += fraggap;
 997                                 skb_trim(skb_prev, maxfraglen);
 998                         }
 999                         copy = datalen - transhdrlen - fraggap;
1000                         if (copy < 0) {
1001                                 err = -EINVAL;
1002                                 kfree_skb(skb);
1003                                 goto error;
1004                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1005                                 err = -EFAULT;
1006                                 kfree_skb(skb);
1007                                 goto error;
1008                         }
1009
1010                         offset += copy;
1011                         length -= datalen - fraggap;
1012                         transhdrlen = 0;
1013                         exthdrlen = 0;
1014                         csummode = CHECKSUM_NONE;
1015
1016                         /*
1017                          * Put the packet on the pending queue
1018                          */
1019                         __skb_queue_tail(&sk->sk_write_queue, skb);
1020                         continue;
1021                 }
1022
1023                 if (copy > length)
1024                         copy = length;
1025
1026                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1027                         unsigned int off;
1028
1029                         off = skb->len;
1030                         if (getfrag(from, skb_put(skb, copy),
1031                                                 offset, copy, off, skb) < 0) {
1032                                 __skb_trim(skb, off);
1033                                 err = -EFAULT;
1034                                 goto error;
1035                         }
1036                 } else {
1037                         int i = skb_shinfo(skb)->nr_frags;
1038                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1039                         struct page *page = sk->sk_sndmsg_page;
1040                         int off = sk->sk_sndmsg_off;
1041                         unsigned int left;
1042
1043                         if (page && (left = PAGE_SIZE - off) > 0) {
1044                                 if (copy >= left)
1045                                         copy = left;
1046                                 if (page != frag->page) {
1047                                         if (i == MAX_SKB_FRAGS) {
1048                                                 err = -EMSGSIZE;
1049                                                 goto error;
1050                                         }
1051                                         get_page(page);
1052                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1053                                         frag = &skb_shinfo(skb)->frags[i];
1054                                 }
1055                         } else if(i < MAX_SKB_FRAGS) {
1056                                 if (copy > PAGE_SIZE)
1057                                         copy = PAGE_SIZE;
1058                                 page = alloc_pages(sk->sk_allocation, 0);
1059                                 if (page == NULL) {
1060                                         err = -ENOMEM;
1061                                         goto error;
1062                                 }
1063                                 sk->sk_sndmsg_page = page;
1064                                 sk->sk_sndmsg_off = 0;
1065
1066                                 skb_fill_page_desc(skb, i, page, 0, 0);
1067                                 frag = &skb_shinfo(skb)->frags[i];
1068                                 skb->truesize += PAGE_SIZE;
1069                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1070                         } else {
1071                                 err = -EMSGSIZE;
1072                                 goto error;
1073                         }
1074                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1075                                 err = -EFAULT;
1076                                 goto error;
1077                         }
1078                         sk->sk_sndmsg_off += copy;
1079                         frag->size += copy;
1080                         skb->len += copy;
1081                         skb->data_len += copy;
1082                 }
1083                 offset += copy;
1084                 length -= copy;
1085         }
1086         return 0;
1087 error:
1088         inet->cork.length -= length;
1089         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1090         return err;
1091 }
1092
1093 int ip6_push_pending_frames(struct sock *sk)
1094 {
1095         struct sk_buff *skb, *tmp_skb;
1096         struct sk_buff **tail_skb;
1097         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1098         struct inet_sock *inet = inet_sk(sk);
1099         struct ipv6_pinfo *np = inet6_sk(sk);
1100         struct ipv6hdr *hdr;
1101         struct ipv6_txoptions *opt = np->cork.opt;
1102         struct rt6_info *rt = np->cork.rt;
1103         struct flowi *fl = &inet->cork.fl;
1104         unsigned char proto = fl->proto;
1105         int err = 0;
1106
1107         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1108                 goto out;
1109         tail_skb = &(skb_shinfo(skb)->frag_list);
1110
1111         /* move skb->data to ip header from ext header */
1112         if (skb->data < skb->nh.raw)
1113                 __skb_pull(skb, skb->nh.raw - skb->data);
1114         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1115                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1116                 *tail_skb = tmp_skb;
1117                 tail_skb = &(tmp_skb->next);
1118                 skb->len += tmp_skb->len;
1119                 skb->data_len += tmp_skb->len;
1120                 skb->truesize += tmp_skb->truesize;
1121                 __sock_put(tmp_skb->sk);
1122                 tmp_skb->destructor = NULL;
1123                 tmp_skb->sk = NULL;
1124         }
1125
1126         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1127         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1128         if (opt && opt->opt_flen)
1129                 ipv6_push_frag_opts(skb, opt, &proto);
1130         if (opt && opt->opt_nflen)
1131                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1132
1133         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1134
1135         *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1136
1137         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1138                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1139         else
1140                 hdr->payload_len = 0;
1141         hdr->hop_limit = np->cork.hop_limit;
1142         hdr->nexthdr = proto;
1143         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1144         ipv6_addr_copy(&hdr->daddr, final_dst);
1145
1146         skb->dst = dst_clone(&rt->u.dst);
1147         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1148         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1149         if (err) {
1150                 if (err > 0)
1151                         err = np->recverr ? net_xmit_errno(err) : 0;
1152                 if (err)
1153                         goto error;
1154         }
1155
1156 out:
1157         inet->cork.flags &= ~IPCORK_OPT;
1158         if (np->cork.opt) {
1159                 kfree(np->cork.opt);
1160                 np->cork.opt = NULL;
1161         }
1162         if (np->cork.rt) {
1163                 dst_release(&np->cork.rt->u.dst);
1164                 np->cork.rt = NULL;
1165                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1166         }
1167         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1168         return err;
1169 error:
1170         goto out;
1171 }
1172
1173 void ip6_flush_pending_frames(struct sock *sk)
1174 {
1175         struct inet_sock *inet = inet_sk(sk);
1176         struct ipv6_pinfo *np = inet6_sk(sk);
1177         struct sk_buff *skb;
1178
1179         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1180                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1181                 kfree_skb(skb);
1182         }
1183
1184         inet->cork.flags &= ~IPCORK_OPT;
1185
1186         if (np->cork.opt) {
1187                 kfree(np->cork.opt);
1188                 np->cork.opt = NULL;
1189         }
1190         if (np->cork.rt) {
1191                 dst_release(&np->cork.rt->u.dst);
1192                 np->cork.rt = NULL;
1193                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1194         }
1195         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1196 }