/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
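
/*
 * A minimal userspace rendering of the ID selection above, for
 * illustration only (select_ident() is a hypothetical name; only the
 * arithmetic mirrors ipv6_select_ident()): the counter is handed out in
 * network byte order ("frag id should be in NBO", see the changelog)
 * and 0 is never used as an identification value.
 *
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	static uint32_t select_ident(void)
 *	{
 *		static uint32_t id = 1;
 *		uint32_t wire = htonl(id);	// value placed on the wire
 *		if (++id == 0)			// wrap around, skipping 0
 *			id = 1;
 *		return wire;
 *	}
 */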
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
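
/*
 * Worked example of the first-word construction used above. The top
 * nibble is the version (6), bits 27..20 carry the 8-bit traffic class,
 * and the low 20 bits carry the flow label, which the flowi already
 * stores in network byte order, so it can be OR-ed in directly. A
 * compilable userspace sketch (values arbitrary, names hypothetical):
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	int main(void)
 *	{
 *		uint32_t tclass = 0xb8;			// e.g. DSCP EF << 2
 *		uint32_t flowlabel = htonl(0x12345);	// 20-bit label in NBO
 *		uint32_t word = htonl(0x60000000 | (tclass << 20)) | flowlabel;
 *
 *		uint32_t host = ntohl(word);		// 0x6b812345
 *		assert((host >> 28) == 6);		// version
 *		assert(((host >> 20) & 0xff) == 0xb8);	// traffic class
 *		assert((host & 0xfffff) == 0x12345);	// flow label
 *		return 0;
 *	}
 */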
/*
 * To avoid extra problems ND packets are sent through this
 * routine. It's code duplication, but I really want to avoid
 * extra checks, since ipv6_build_header() is used by TCP (which
 * is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (ipv6_devconf.proxy_ndp &&
	    pneigh_lookup(&nd_tbl, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst) && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#ifdef CONFIG_IPV6_MIP6
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
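
/*
 * Example of the walk above over a synthetic header chain: the function
 * returns the length of the RFC 2460 "unfragmentable part" and leaves
 * *nexthdr pointing at the byte the callers overwrite with
 * NEXTHDR_FRAGMENT. A compilable userspace sketch (buffer and names are
 * hypothetical; constants mirror the kernel's):
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *
 *	#define NEXTHDR_HOP	0
 *	#define NEXTHDR_TCP	6
 *
 *	int main(void)
 *	{
 *		// 8-byte Hop-by-Hop header: nexthdr = TCP, hdrlen = 0
 *		uint8_t ext[8] = { NEXTHDR_TCP, 0 };
 *		unsigned offset = 40;		// sizeof(struct ipv6hdr)
 *		uint8_t first = NEXTHDR_HOP;	// IPv6 header's nexthdr
 *
 *		if (first == NEXTHDR_HOP)
 *			offset += (ext[1] + 1) * 8;	// ipv6_optlen()
 *		assert(offset == 48);	// fragment header goes at byte 48
 *		return 0;
 *	}
 */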
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket. (This last
	 * check should be redundant, but it's free.)
	 */
	if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
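
/*
 * Worked example of the offset encoding in the slow path above: because
 * every non-final fragment length is rounded down to a multiple of 8
 * (len &= ~7), the running byte offset equals the RFC 2460 "fragment
 * offset in 8-octet units" shifted left by 3, so a plain htons() stores
 * it correctly and the low three bits stay free for the M flag. A
 * compilable sketch (values arbitrary):
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	#define IP6_MF 0x0001
 *
 *	int main(void)
 *	{
 *		unsigned mtu = 1280 - 40 - 8;	// payload space per fragment
 *		unsigned len = mtu & ~7U;	// 1232, already 8-aligned
 *		uint16_t frag_off = htons(len) | htons(IP6_MF);
 *
 *		uint16_t host = ntohs(frag_off);
 *		assert((host >> 3) == len / 8);	// 8-octet units on the wire
 *		assert(host & IP6_MF);		// more-fragments flag set
 *		return 0;
 *	}
 */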
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(&fl->fl6_src, (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
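
/*
 * Typical caller pattern, as a hedged sketch rather than a real call
 * site: datagram sendmsg paths fill a flowi and let the lookup both
 * route the packet and, via ipv6_get_saddr() in the tail function,
 * pick a source address when fl6_src was left unspecified. On error
 * the dst has already been released and *dst set to NULL.
 *
 *	struct flowi fl;
 *	struct dst_entry *dst = NULL;
 *	int err;
 *
 *	memset(&fl, 0, sizeof(fl));
 *	fl.proto = IPPROTO_UDP;
 *	ipv6_addr_copy(&fl.fl6_dst, daddr);	// daddr: chosen destination
 *	fl.oif = sk->sk_bound_dev_if;
 *
 *	err = ip6_dst_lookup(sk, &dst, &fl);
 *	if (err)
 *		return err;			// nothing to release
 */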
/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->u.dst.nfheader_len + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}
void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
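
/*
 * ip6_append_data(), ip6_push_pending_frames() and
 * ip6_flush_pending_frames() together form the corking API used by the
 * UDPv6, raw and ICMPv6 transmit paths. A condensed sketch of the
 * contract (the error handling and cork bookkeeping of real callers
 * such as udp_v6_sendmsg() are omitted; variable names are illustrative):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg->msg_iov, len,
 *			      transhdrlen, hlimit, tclass, opt, fl,
 *			      (struct rt6_info *)dst, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);	// drop what was queued
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk); // build header and send
 *	release_sock(sk);
 */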