2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * RAW - implementation of IP "raw" sockets.
8 * Version: $Id: raw.c,v 1.41 1999/05/30 01:16:19 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Alan Cox : verify_area() fixed up
15 * Alan Cox : ICMP error handling
16 * Alan Cox : EMSGSIZE if you send too big a packet
17 * Alan Cox : Now uses generic datagrams and shared skbuff
18 * library. No more peek crashes, no more backlogs
19 * Alan Cox : Checks sk->broadcast.
20 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
21 * Alan Cox : Raw passes ip options too
22 * Alan Cox : Setsocketopt added
23 * Alan Cox : Fixed error return for broadcasts
24 * Alan Cox : Removed wake_up calls
25 * Alan Cox : Use ttl/tos
26 * Alan Cox : Cleaned up old debugging
27 * Alan Cox : Use new kernel side addresses
28 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
29 * Alan Cox : BSD style RAW socket demultiplexing.
30 * Alan Cox : Beginnings of mrouted support.
31 * Alan Cox : Added IP_HDRINCL option.
32 * Alan Cox : Skip broadcast check if BSDism set.
33 * David S. Miller : New socket lookup architecture.
35 * This program is free software; you can redistribute it and/or
36 * modify it under the terms of the GNU General Public License
37 * as published by the Free Software Foundation; either version
38 * 2 of the License, or (at your option) any later version.
41 #include <linux/config.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
44 #include <linux/types.h>
45 #include <linux/sched.h>
46 #include <linux/errno.h>
47 #include <linux/timer.h>
49 #include <linux/kernel.h>
50 #include <linux/fcntl.h>
51 #include <linux/socket.h>
53 #include <linux/inet.h>
54 #include <linux/netdevice.h>
55 #include <linux/mroute.h>
57 #include <net/protocol.h>
58 #include <linux/skbuff.h>
63 #include <net/checksum.h>
/*
 * File-scope socket state.
 *
 * mroute_socket - declared (and initialised to NULL) only when
 *                 CONFIG_IP_MROUTE is defined.
 * raw_v4_htable - array of RAWV4_HTABLE_SIZE bucket heads.  The hash,
 *                 unhash, rehash and input routines in this file index
 *                 it and follow the ->next pointer of each socket.
 *
 * NOTE(review): the #endif matching the #ifdef below is not visible in
 * this listing - confirm it sits between the two declarations.
 */
65 #ifdef CONFIG_IP_MROUTE
66 struct sock
*mroute_socket
=NULL
;
69 struct sock
*raw_v4_htable
[RAWV4_HTABLE_SIZE
];
/*
 * raw_v4_hash - hash-table insertion hook for a raw socket.
 *
 * The visible statements reduce `num` to a bucket index by masking it
 * with (RAWV4_HTABLE_SIZE - 1), point `skp` at that bucket's head in
 * raw_v4_htable, and bracket the update with
 * SOCKHASH_LOCK_WRITE() / SOCKHASH_UNLOCK_WRITE().
 *
 * NOTE(review): the declarations of `num` and `skp` and the statements
 * executed under the lock are missing from this listing; presumably sk
 * is linked into the bucket there - confirm against the full source.
 * The mask only works as a modulo if RAWV4_HTABLE_SIZE is a power of
 * two - TODO confirm.
 */
71 static void raw_v4_hash(struct sock
*sk
)
76 num
&= (RAWV4_HTABLE_SIZE
- 1);
77 skp
= &raw_v4_htable
[num
];
78 SOCKHASH_LOCK_WRITE();
82 SOCKHASH_UNLOCK_WRITE();
/*
 * raw_v4_unhash - hash-table removal hook for a raw socket.
 *
 * The visible statements select a bucket by masking `num` with
 * (RAWV4_HTABLE_SIZE - 1), start `skp` at that bucket's head and, under
 * SOCKHASH_LOCK_WRITE(), advance `skp` to the address of the current
 * entry's ->next pointer (pointer-to-pointer list traversal).
 *
 * NOTE(review): the loop header, the comparison against sk and the
 * unlink statement are missing from this listing - confirm against the
 * full source before relying on this description.
 */
85 static void raw_v4_unhash(struct sock
*sk
)
90 num
&= (RAWV4_HTABLE_SIZE
- 1);
91 skp
= &raw_v4_htable
[num
];
93 SOCKHASH_LOCK_WRITE();
99 skp
= &((*skp
)->next
);
101 SOCKHASH_UNLOCK_WRITE();
/*
 * raw_v4_rehash - move a raw socket to a different hash bucket.
 *
 * Visible behaviour, all under SOCKHASH_LOCK_WRITE():
 *   - `oldnum` is taken from sk->hashent and selects the bucket that is
 *     walked (via the address of each entry's ->next pointer) while
 *     *skp is non-NULL;
 *   - `num`, masked with (RAWV4_HTABLE_SIZE - 1), selects the new
 *     bucket, and sk is pushed onto its head (sk->next takes the old
 *     head, then the head becomes sk).
 *
 * NOTE(review): the statements inside the while loop that detect sk and
 * unlink it from the old chain, and any update of sk->hashent, are
 * missing from this listing - confirm against the full source.
 */
104 static void raw_v4_rehash(struct sock
*sk
)
108 int oldnum
= sk
->hashent
;
110 num
&= (RAWV4_HTABLE_SIZE
- 1);
111 skp
= &raw_v4_htable
[oldnum
];
113 SOCKHASH_LOCK_WRITE();
114 while(*skp
!= NULL
) {
119 skp
= &((*skp
)->next
);
121 sk
->next
= raw_v4_htable
[num
];
122 raw_v4_htable
[num
] = sk
;
124 SOCKHASH_UNLOCK_WRITE();
/*
 * __raw_v4_lookup - scan a socket chain for a raw socket matching a packet.
 *
 * Walks the list starting at `sk` through ->next.  An entry `s`
 * satisfies the visible condition when ALL of the following hold:
 *   - s->num equals `num`;
 *   - s is not both dead and in state TCP_CLOSE;
 *   - s->daddr is zero (wildcard) or equals `raddr`;
 *   - s->rcv_saddr is zero (wildcard) or equals `laddr`;
 *   - s->bound_dev_if is zero (wildcard) or equals `dif`.
 *
 * No locking is visible in this function; raw_v4_lookup() and
 * raw_v4_input() call it with the SOCKHASH read lock held.
 *
 * NOTE(review): the end of the parameter list (where `dif` is
 * presumably declared), the declaration of `s`, and the statements that
 * act on a match / return the result are missing from this listing.
 */
127 static __inline__
struct sock
*__raw_v4_lookup(struct sock
*sk
, unsigned short num
,
128 unsigned long raddr
, unsigned long laddr
,
133 for(s
= sk
; s
; s
= s
->next
) {
134 if((s
->num
== num
) &&
135 !(s
->dead
&& (s
->state
== TCP_CLOSE
)) &&
136 !(s
->daddr
&& s
->daddr
!= raddr
) &&
137 !(s
->rcv_saddr
&& s
->rcv_saddr
!= laddr
) &&
138 !(s
->bound_dev_if
&& s
->bound_dev_if
!= dif
))
/*
 * raw_v4_lookup - locked wrapper around __raw_v4_lookup().
 *
 * Takes SOCKHASH_LOCK_READ(), forwards sk, num, raddr, laddr and dif to
 * __raw_v4_lookup(), stores the result back into `sk`, and drops the
 * lock with SOCKHASH_UNLOCK_READ().
 *
 * NOTE(review): the tail of the parameter list and the return statement
 * are missing from this listing; presumably the updated `sk` is
 * returned - confirm against the full source.
 */
144 struct sock
*raw_v4_lookup(struct sock
*sk
, unsigned short num
,
145 unsigned long raddr
, unsigned long laddr
,
148 SOCKHASH_LOCK_READ();
149 sk
= __raw_v4_lookup(sk
, num
, raddr
, laddr
, dif
);
150 SOCKHASH_UNLOCK_READ();
/*
 * icmp_filter - test an ICMP packet against the socket's ICMP filter.
 *
 * Reads the ICMP type from skb->h.icmph->type and returns
 * test_bit(type, &sk->tp_pinfo.tp_raw4.filter), i.e. non-zero when the
 * bit for that type is set in the filter.  raw_v4_input() delivers an
 * ICMP packet only when this function returns zero.
 *
 * NOTE(review): the declaration of `type`, any range guard in front of
 * the test_bit() call, and the fallback return that the trailing
 * comment refers to are missing from this listing - confirm against
 * the full source.
 */
159 static __inline__
int icmp_filter(struct sock
*sk
, struct sk_buff
*skb
)
163 type
= skb
->h
.icmph
->type
;
165 return test_bit(type
, &sk
->tp_pinfo
.tp_raw4
.filter
);
167 /* Do not block unknown ICMP types */
171 /* IP input processing comes here for RAW socket delivery.
172 * This is fun as to avoid copies we want to make no surplus
175 * RFC 1122: SHOULD pass TOS value up to the transport layer.
176 * -> It does. And not only TOS, but all IP header.
/*
 * raw_v4_input - hand an incoming IP packet to matching raw sockets.
 *
 * Visible behaviour:
 *   - takes SOCKHASH_LOCK_READ_BH() and loads the bucket head
 *     raw_v4_htable[hash]; a NULL head is tested for explicitly;
 *   - finds the first candidate with __raw_v4_lookup() using
 *     iph->protocol, iph->saddr and iph->daddr;
 *   - looks ahead for the next candidate (`sknext`) by repeating the
 *     lookup from sk->next;
 *   - for a socket that passes the filter test (protocol is not
 *     IPPROTO_ICMP, or icmp_filter() returned 0) the skb is duplicated
 *     with skb_clone(skb, GFP_ATOMIC);
 *   - the read lock is dropped and re-taken once inside the function
 *     and released again at the end.
 *
 * NOTE(review): the loop structure, the delivery call made while the
 * lock is dropped, the handling of a failed clone and the return value
 * are all missing from this listing - confirm against the full source.
 */
178 struct sock
*raw_v4_input(struct sk_buff
*skb
, struct iphdr
*iph
, int hash
)
182 SOCKHASH_LOCK_READ_BH();
183 if ((sk
= raw_v4_htable
[hash
]) == NULL
)
185 sk
= __raw_v4_lookup(sk
, iph
->protocol
,
186 iph
->saddr
, iph
->daddr
,
189 struct sock
*sknext
= __raw_v4_lookup(sk
->next
, iph
->protocol
,
190 iph
->saddr
, iph
->daddr
,
193 if (iph
->protocol
!= IPPROTO_ICMP
||
194 ! icmp_filter(sk
, skb
)) {
195 struct sk_buff
*clone
;
199 clone
= skb_clone(skb
, GFP_ATOMIC
);
201 SOCKHASH_UNLOCK_READ_BH();
203 SOCKHASH_LOCK_READ_BH();
209 SOCKHASH_UNLOCK_READ_BH();
/*
 * raw_err - process an ICMP error that refers to a raw socket.
 *
 * Visible behaviour:
 *   - `type` and `code` are read from the ICMP header at skb->h.icmph;
 *   - a test is made for "ip_recverr not set AND socket not in
 *     TCP_ESTABLISHED" (the existing comment says errors are reported
 *     only if one of those holds);
 *   - the ICMP type is dispatched over ICMP_TIME_EXCEEDED,
 *     ICMP_SOURCE_QUENCH, ICMP_PARAMETERPROB and ICMP_DEST_UNREACH:
 *       * `info` is set from the top byte of un.gateway
 *         (ntohl(...) >> 24) in the statement following the
 *         ICMP_PARAMETERPROB label;
 *       * for ICMP_DEST_UNREACH, `code` is range-checked against
 *         NR_ICMP_UNREACH, then `err`/`harderr` come from
 *         icmp_err_convert[code]; for ICMP_FRAG_NEEDED `harderr`
 *         becomes (sk->ip_pmtudisc != IP_PMTUDISC_DONT) and `info` is
 *         set from un.frag.mtu;
 *   - ip_icmp_error() is called with the payload that follows the ICMP
 *     header, and sk->error_report(sk) is invoked when ip_recverr is
 *     set or the error is hard.
 *
 * NOTE(review): the switch statement itself, the per-case err values,
 * the break/return statements, the guard around ip_icmp_error() and the
 * assignment to sk->err are missing from this listing - confirm against
 * the full source.
 */
214 void raw_err (struct sock
*sk
, struct sk_buff
*skb
)
216 int type
= skb
->h
.icmph
->type
;
217 int code
= skb
->h
.icmph
->code
;
222 /* Report error on raw socket, if:
223 1. User requested ip_recverr.
224 2. Socket is connected (otherwise the error indication
225 is useless without ip_recverr and error is hard.
227 if (!sk
->ip_recverr
&& sk
->state
!= TCP_ESTABLISHED
)
232 case ICMP_TIME_EXCEEDED
:
235 case ICMP_SOURCE_QUENCH
:
237 case ICMP_PARAMETERPROB
:
239 info
= ntohl(skb
->h
.icmph
->un
.gateway
)>>24;
242 case ICMP_DEST_UNREACH
:
244 if (code
> NR_ICMP_UNREACH
)
246 err
= icmp_err_convert
[code
].errno
;
247 harderr
= icmp_err_convert
[code
].fatal
;
248 if (code
== ICMP_FRAG_NEEDED
) {
249 harderr
= (sk
->ip_pmtudisc
!= IP_PMTUDISC_DONT
);
251 info
= ntohs(skb
->h
.icmph
->un
.frag
.mtu
);
256 ip_icmp_error(sk
, skb
, err
, 0, info
, (u8
*)(skb
->h
.icmph
+ 1));
258 if (sk
->ip_recverr
|| harderr
) {
260 sk
->error_report(sk
);
/*
 * raw_rcv_skb - queue a received skb on a raw socket.
 *
 * Calls sock_queue_rcv_skb(sk, skb) and tests for a negative result.
 * Two counters are updated in this function:
 * ip_statistics.IpInDiscards and ip_statistics.IpInDelivers.
 * This function is also installed as the backlog_rcv entry of raw_prot.
 *
 * NOTE(review): the branch structure is missing from this listing;
 * presumably IpInDiscards belongs to the failure path (where the skb
 * would also have to be freed) and IpInDelivers to the success path -
 * confirm against the full source, including the return values.
 */
264 static int raw_rcv_skb(struct sock
* sk
, struct sk_buff
* skb
)
266 /* Charge it to the socket. */
268 if (sock_queue_rcv_skb(sk
,skb
)<0)
270 ip_statistics
.IpInDiscards
++;
275 ip_statistics
.IpInDelivers
++;
280 * This should be the easiest of all, all we do is
281 * copy it into a buffer. All demultiplexing is done
/*
 * raw_rcv - receive entry point for one raw socket.
 *
 * Trims the skb to the length given by the IP header's tot_len field
 * (converted with ntohs), sets skb->h.raw to skb->nh.raw so that the
 * transport-header pointer refers to the start of the IP header, and
 * passes the skb to raw_rcv_skb().
 *
 * NOTE(review): the return statement is missing from this listing, and
 * the result of raw_rcv_skb() is not used in the visible call - confirm
 * against the full source.
 */
285 int raw_rcv(struct sock
*sk
, struct sk_buff
*skb
)
287 /* Now we need to copy this into memory. */
288 skb_trim(skb
, ntohs(skb
->nh
.iph
->tot_len
));
290 skb
->h
.raw
= skb
->nh
.raw
;
292 raw_rcv_skb(sk
, skb
);
303 * Send a RAW IP packet.
307 * Callback support is trivial for SOCK_RAW
/*
 * raw_getfrag - fragment-fill callback for ordinary raw sends.
 *
 * @p:       opaque cookie; cast to struct rawfakehdr *.
 * @to:      destination buffer for this fragment.
 * @offset:  offset into the user iovec to copy from.
 * @fraglen: number of bytes to copy.
 *
 * Returns whatever memcpy_fromiovecend(to, rfh->iov, offset, fraglen)
 * returns.  raw_sendmsg() passes this function to ip_build_xmit() when
 * sk->ip_hdrincl is not set.
 */
310 static int raw_getfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
312 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
313 return memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
);
317 * IPPROTO_RAW needs extra work.
/*
 * raw_getrawfrag - fragment-fill callback used when the caller supplies
 * the IP header (raw_sendmsg() selects it when sk->ip_hdrincl is set).
 *
 * @p:       opaque cookie; cast to struct rawfakehdr *.
 * @to:      destination buffer for this fragment.
 * @offset:  offset into the user iovec to copy from.
 * @fraglen: number of bytes to copy.
 *
 * Visible behaviour: copies the data with memcpy_fromiovecend() and
 * tests for a non-zero result; then treats `to` as a struct iphdr and
 * patches header fields in place: saddr from rfh->saddr, tot_len from
 * htons(fraglen), id from htons(ip_id_count++), and check recomputed
 * with ip_fast_csum() over iph->ihl words.
 *
 * NOTE(review): the error return, the conditions guarding each header
 * fix-up (e.g. whether they apply only at offset 0 or only to unset
 * fields) and the final return are missing from this listing - confirm
 * against the full source.  ip_id_count++ is not visibly protected by
 * any lock here.
 */
320 static int raw_getrawfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
322 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
324 if (memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
))
328 struct iphdr
*iph
= (struct iphdr
*)to
;
330 iph
->saddr
= rfh
->saddr
;
332 iph
->tot_len
=htons(fraglen
); /* This is right as you can't frag
335 * Deliberate breach of modularity to keep
336 * ip_build_xmit clean (well less messy).
339 iph
->id
= htons(ip_id_count
++);
340 iph
->check
=ip_fast_csum((unsigned char *)iph
, iph
->ihl
);
/*
 * raw_sendmsg - transmit one datagram on a raw socket.
 *
 * Visible sequence:
 *   1. Length sanity check: len < 0 or len > 0xFFFF is tested.
 *   2. Flag checks: MSG_OOB is tested separately, then any flag other
 *      than MSG_DONTROUTE / MSG_DONTWAIT.
 *   3. Destination: if msg_namelen is non-zero the name is treated as a
 *      struct sockaddr_in, its length and sin_family are checked (a
 *      KERN_INFO message is printed for a wrong family) and daddr is
 *      taken from sin_addr; otherwise the socket state is compared with
 *      TCP_ESTABLISHED.
 *   4. ipc.addr / ipc.oif start from sk->saddr / sk->bound_dev_if and
 *      may be changed by ip_cmsg_send() when control data is present.
 *   5. rfh.saddr is seeded from ipc.addr; daddr may be replaced by
 *      ipc.opt->faddr.
 *   6. tos = RT_TOS(sk->ip_tos) | sk->localroute; MSG_DONTROUTE is
 *      tested afterwards.
 *   7. For a multicast daddr, sk->ip_mc_index / sk->ip_mc_addr are used
 *      for ipc.oif / rfh.saddr.
 *   8. ip_route_output() resolves the route; a broadcast route is
 *      checked against sk->broadcast.
 *   9. rfh.iov, rfh.saddr (rt->rt_src) and ipc.addr (rt->rt_dst) are
 *      filled in and ip_build_xmit() is called with raw_getrawfrag when
 *      sk->ip_hdrincl is set, raw_getfrag otherwise.
 *
 * Returns err when err < 0, otherwise len.
 *
 * NOTE(review): most branch bodies (error codes returned, guards on the
 * multicast and option handling, cleanup of ipc.opt and rt) are missing
 * from this listing - confirm against the full source.  Also note that
 * msg_namelen is compared with sizeof(*usin), a mixed int/size_t
 * comparison - verify how negative lengths are excluded by callers.
 */
345 static int raw_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int len
)
347 struct ipcm_cookie ipc
;
348 struct rawfakehdr rfh
;
349 struct rtable
*rt
= NULL
;
355 /* This check is ONLY to check for arithmetic overflow
356 on integer(!) len. Not more! Real check will be made
357 in ip_build_xmit --ANK
359 BTW socket.c -> af_*.c -> ... make multiple
360 invalid conversions size_t -> int. We MUST repair it f.e.
361 by replacing all of them with size_t and revise all
362 the places sort of len += sizeof(struct iphdr)
363 If len was ULONG_MAX-10 it would be catastrophe --ANK
366 if (len
< 0 || len
> 0xFFFF)
373 if (msg
->msg_flags
& MSG_OOB
) /* Mirror BSD error message compatibility */
376 if (msg
->msg_flags
& ~(MSG_DONTROUTE
|MSG_DONTWAIT
))
380 * Get and verify the address.
383 if (msg
->msg_namelen
) {
384 struct sockaddr_in
*usin
= (struct sockaddr_in
*)msg
->msg_name
;
385 if (msg
->msg_namelen
< sizeof(*usin
))
387 if (usin
->sin_family
!= AF_INET
) {
388 static int complained
;
390 printk(KERN_INFO
"%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current
->comm
);
391 if (usin
->sin_family
)
394 daddr
= usin
->sin_addr
.s_addr
;
395 /* ANK: I did not forget to get protocol from port field.
396 * I just do not know, who uses this weirdness.
397 * IP_HDRINCL is much more convenient.
400 if (sk
->state
!= TCP_ESTABLISHED
)
405 ipc
.addr
= sk
->saddr
;
407 ipc
.oif
= sk
->bound_dev_if
;
409 if (msg
->msg_controllen
) {
410 int tmp
= ip_cmsg_send(msg
, &ipc
);
417 rfh
.saddr
= ipc
.addr
;
425 /* Linux does not mangle headers on raw sockets,
426 * so that IP options + IP_HDRINCL is non-sense.
433 daddr
= ipc
.opt
->faddr
;
436 tos
= RT_TOS(sk
->ip_tos
) | sk
->localroute
;
437 if (msg
->msg_flags
&MSG_DONTROUTE
)
440 if (MULTICAST(daddr
)) {
442 ipc
.oif
= sk
->ip_mc_index
;
444 rfh
.saddr
= sk
->ip_mc_addr
;
447 err
= ip_route_output(&rt
, daddr
, rfh
.saddr
, tos
, ipc
.oif
);
453 if (rt
->rt_flags
&RTCF_BROADCAST
&& !sk
->broadcast
)
456 rfh
.iov
= msg
->msg_iov
;
457 rfh
.saddr
= rt
->rt_src
;
459 ipc
.addr
= rt
->rt_dst
;
460 err
=ip_build_xmit(sk
, sk
->ip_hdrincl
? raw_getrawfrag
: raw_getfrag
,
461 &rfh
, len
, &ipc
, rt
, msg
->msg_flags
);
468 return err
<0 ? err
: len
;
/*
 * raw_close - close hook for raw sockets (installed as raw_prot's close).
 *
 * Visible behaviour: sets sk->state to TCP_CLOSE and calls
 * ip_ra_control(sk, 0, NULL).  The inline comments describe the
 * intended order: detach from the hash tables, drop kernel references,
 * then mark the socket dead before destroying it.
 *
 * NOTE(review): the calls that actually remove the socket from the hash
 * tables, set sk->dead and destroy the socket are missing from this
 * listing, and `timeout` is not referenced in the visible lines -
 * confirm against the full source.
 */
471 static void raw_close(struct sock
*sk
, long timeout
)
475 /* Observation: when raw_close is called, processes have
476 no access to socket anymore. But net still has.
477 Step one, detach it from networking:
479 A. Remove from hash tables.
481 sk
->state
= TCP_CLOSE
;
484 B. Raw sockets may have direct kernel references. Kill them.
486 ip_ra_control(sk
, 0, NULL
);
488 /* At this point socket cannot receive new packets anymore */
491 /* But we still have packets pending on receive
492 queue and probably, our own packets waiting in device queues.
493 sock_destroy will drain receive queue, but transmitted
494 packets will delay socket destruction.
495 Set sk->dead=1 in order to prevent wakeups, when these
496 packets are freed.
501 /* That's all. No races here. */
504 /* This gets rid of all the nasties in af_inet. -DaveM */
/*
 * raw_bind - bind a raw socket to a local IPv4 address.
 *
 * @sk:       socket being bound.
 * @uaddr:    user-supplied address, treated as struct sockaddr_in.
 * @addr_len: length of @uaddr as given by the caller.
 *
 * Visible behaviour:
 *   - a test is made for "socket not in TCP_CLOSE, or addr_len smaller
 *     than sizeof(struct sockaddr_in)";
 *   - the address is classified with inet_addr_type();
 *   - a non-zero address that is not RTN_LOCAL, RTN_MULTICAST or
 *     RTN_BROADCAST leads to -EADDRNOTAVAIL; with
 *     CONFIG_IP_TRANSPARENT_PROXY an extra test on RTN_UNICAST and
 *     capable(CAP_NET_ADMIN) precedes that return;
 *   - on acceptance sk->rcv_saddr and sk->saddr are both set to the
 *     address, sk->saddr is then cleared for multicast/broadcast
 *     addresses, and the cached route is dropped with
 *     dst_release(xchg(&sk->dst_cache, NULL)).
 *
 * NOTE(review): the return taken when the first test fails, the #endif
 * and the success return are missing from this listing.  The addr_len
 * test mixes int with size_t, so a negative addr_len would compare as a
 * large unsigned value - verify that callers never pass one.
 */
505 static int raw_bind(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
507 struct sockaddr_in
*addr
= (struct sockaddr_in
*) uaddr
;
510 if((sk
->state
!= TCP_CLOSE
) || (addr_len
< sizeof(struct sockaddr_in
)))
512 chk_addr_ret
= inet_addr_type(addr
->sin_addr
.s_addr
);
513 if(addr
->sin_addr
.s_addr
!= 0 && chk_addr_ret
!= RTN_LOCAL
&&
514 chk_addr_ret
!= RTN_MULTICAST
&& chk_addr_ret
!= RTN_BROADCAST
) {
515 #ifdef CONFIG_IP_TRANSPARENT_PROXY
516 /* Superuser may bind to any address to allow transparent proxying. */
517 if(chk_addr_ret
!= RTN_UNICAST
|| !capable(CAP_NET_ADMIN
))
519 return -EADDRNOTAVAIL
;
521 sk
->rcv_saddr
= sk
->saddr
= addr
->sin_addr
.s_addr
;
522 if(chk_addr_ret
== RTN_MULTICAST
|| chk_addr_ret
== RTN_BROADCAST
)
523 sk
->saddr
= 0; /* Use device */
524 dst_release(xchg(&sk
->dst_cache
, NULL
));
529 * This should be easy, if there is something there
530 * we return it, otherwise we block.
/*
 * raw_recvmsg - receive one datagram from a raw socket.
 *
 * @sk:       socket to read from.
 * @msg:      destination message header (iovec, name, flags).
 * @len:      size of the caller's buffer.
 * @noblock:  passed through to skb_recv_datagram().
 * @flags:    MSG_* flags; MSG_ERRQUEUE diverts to ip_recv_error().
 * @addr_len: receives sizeof(struct sockaddr_in).
 *
 * Visible behaviour: reports the address length, returns
 * ip_recv_error() for MSG_ERRQUEUE, otherwise dequeues an skb with
 * skb_recv_datagram(), sets MSG_TRUNC in msg->msg_flags in a branch
 * whose condition is not shown, copies `copied` bytes from offset 0
 * with skb_copy_datagram_iovec(), records skb->stamp in sk->stamp,
 * fills the source address (AF_INET, IP header saddr) into msg_name,
 * calls ip_cmsg_recv() when sk->ip_cmsg_flags is set, and frees the
 * skb.  Returns err if it is non-zero, otherwise copied (GNU "?:"
 * extension).
 *
 * NOTE(review): the guards around *addr_len and `sin` (e.g. NULL
 * checks), the computation of `copied`, the condition for MSG_TRUNC and
 * the error exits are missing from this listing - confirm against the
 * full source.
 */
533 int raw_recvmsg(struct sock
*sk
, struct msghdr
*msg
, int len
,
534 int noblock
, int flags
,int *addr_len
)
539 struct sockaddr_in
*sin
=(struct sockaddr_in
*)msg
->msg_name
;
545 *addr_len
=sizeof(*sin
);
547 if (flags
& MSG_ERRQUEUE
)
548 return ip_recv_error(sk
, msg
, len
);
550 skb
=skb_recv_datagram(sk
,flags
,noblock
,&err
);
557 msg
->msg_flags
|= MSG_TRUNC
;
561 err
= skb_copy_datagram_iovec(skb
, 0, msg
->msg_iov
, copied
);
565 sk
->stamp
=skb
->stamp
;
567 /* Copy the address. */
569 sin
->sin_family
= AF_INET
;
570 sin
->sin_addr
.s_addr
= skb
->nh
.iph
->saddr
;
572 if (sk
->ip_cmsg_flags
)
573 ip_cmsg_recv(msg
, skb
);
575 skb_free_datagram(sk
, skb
);
576 return (err
? : copied
);
/*
 * raw_init - per-socket initialisation for raw sockets.
 *
 * For a socket whose protocol number (sk->num) is IPPROTO_ICMP, the
 * ICMP filter in sk->tp_pinfo.tp_raw4 is cleared to all zeroes, so no
 * ICMP type bit is set initially.
 *
 * NOTE(review): the return statement is missing from this listing -
 * confirm against the full source.
 */
579 static int raw_init(struct sock
*sk
)
581 struct raw_opt
*tp
= &(sk
->tp_pinfo
.tp_raw4
);
582 if (sk
->num
== IPPROTO_ICMP
)
583 memset(&tp
->filter
, 0, sizeof(tp
->filter
));
/*
 * raw_seticmpfilter - install a new ICMP filter from user space.
 *
 * @sk:     socket whose tp_raw4.filter is overwritten.
 * @optval: user pointer to the new filter.
 * @optlen: number of bytes supplied; clamped to
 *          sizeof(struct icmp_filter) before copying.
 *
 * Copies at most sizeof(struct icmp_filter) bytes with
 * copy_from_user(); a shorter optlen overwrites only the leading bytes
 * of the filter.
 *
 * NOTE(review): the return statements are missing from this listing.
 * optlen is an int compared with a size_t, so a negative optlen would
 * compare as a large unsigned value and be clamped - verify that this
 * is the intended handling.
 */
587 static int raw_seticmpfilter(struct sock
*sk
, char *optval
, int optlen
)
589 if (optlen
> sizeof(struct icmp_filter
))
590 optlen
= sizeof(struct icmp_filter
);
591 if (copy_from_user(&sk
->tp_pinfo
.tp_raw4
.filter
, optval
, optlen
))
/*
 * raw_geticmpfilter - copy the socket's ICMP filter out to user space.
 *
 * @sk:     socket whose tp_raw4.filter is read.
 * @optval: user buffer that receives the filter bytes.
 * @optlen: user pointer holding the buffer size on entry; rewritten
 *          with the number of bytes actually returned.
 *
 * Reads the requested length with get_user(), clamps it to
 * sizeof(struct icmp_filter), writes the clamped length back with
 * put_user(), then copies that many bytes with copy_to_user().
 *
 * NOTE(review): the declaration of `len` and all return statements are
 * missing from this listing - confirm the error codes and how a
 * negative length is treated against the full source.
 */
596 static int raw_geticmpfilter(struct sock
*sk
, char *optval
, int *optlen
)
600 if (get_user(len
,optlen
))
602 if (len
> sizeof(struct icmp_filter
))
603 len
= sizeof(struct icmp_filter
);
604 if (put_user(len
, optlen
))
606 if (copy_to_user(optval
, &sk
->tp_pinfo
.tp_raw4
.filter
, len
))
/*
 * raw_setsockopt - setsockopt handler for raw sockets.
 *
 * Any level other than SOL_RAW is forwarded unchanged to
 * ip_setsockopt().  For SOL_RAW the visible path checks that the socket
 * protocol is IPPROTO_ICMP and then delegates to raw_seticmpfilter().
 *
 * NOTE(review): the dispatch on `optname` and the error returns (for a
 * non-ICMP socket or an unknown option) are missing from this listing -
 * confirm against the full source.
 */
611 static int raw_setsockopt(struct sock
*sk
, int level
, int optname
,
612 char *optval
, int optlen
)
614 if (level
!= SOL_RAW
)
615 return ip_setsockopt(sk
, level
, optname
, optval
, optlen
);
619 if (sk
->num
!= IPPROTO_ICMP
)
621 return raw_seticmpfilter(sk
, optval
, optlen
);
/*
 * raw_getsockopt - getsockopt handler for raw sockets.
 *
 * Any level other than SOL_RAW is forwarded unchanged to
 * ip_getsockopt().  For SOL_RAW the visible path checks that the socket
 * protocol is IPPROTO_ICMP and then delegates to raw_geticmpfilter().
 *
 * NOTE(review): the dispatch on `optname` and the error returns (for a
 * non-ICMP socket or an unknown option) are missing from this listing -
 * confirm against the full source.
 */
627 static int raw_getsockopt(struct sock
*sk
, int level
, int optname
,
628 char *optval
, int *optlen
)
630 if (level
!= SOL_RAW
)
631 return ip_getsockopt(sk
, level
, optname
, optval
, optlen
);
635 if (sk
->num
!= IPPROTO_ICMP
)
637 return raw_geticmpfilter(sk
, optval
, optlen
);
643 struct proto raw_prot
= {
644 (struct sock
*)&raw_prot
, /* sklist_next */
645 (struct sock
*)&raw_prot
, /* sklist_prev */
646 raw_close
, /* close */
647 udp_connect
, /* connect */
649 NULL
, /* retransmit */
650 NULL
, /* write_wakeup */
651 NULL
, /* read_wakeup */
652 datagram_poll
, /* poll */
653 #ifdef CONFIG_IP_MROUTE
654 ipmr_ioctl
, /* ioctl */
661 raw_setsockopt
, /* setsockopt */
662 raw_getsockopt
, /* getsockopt */
663 raw_sendmsg
, /* sendmsg */
664 raw_recvmsg
, /* recvmsg */
666 raw_rcv_skb
, /* backlog_rcv */
667 raw_v4_hash
, /* hash */
668 raw_v4_unhash
, /* unhash */
669 raw_v4_rehash
, /* rehash */
670 NULL
, /* good_socknum */
671 NULL
, /* verify_bind */
672 128, /* max_header */