2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * RAW - implementation of IP "raw" sockets.
8 * Version: $Id: raw.c,v 1.42 1999/07/02 11:26:26 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Alan Cox : verify_area() fixed up
15 * Alan Cox : ICMP error handling
16 * Alan Cox : EMSGSIZE if you send too big a packet
17 * Alan Cox : Now uses generic datagrams and shared skbuff
18 * library. No more peek crashes, no more backlogs
19 * Alan Cox : Checks sk->broadcast.
20 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
21 * Alan Cox : Raw passes ip options too
22 * Alan Cox : Setsocketopt added
23 * Alan Cox : Fixed error return for broadcasts
24 * Alan Cox : Removed wake_up calls
25 * Alan Cox : Use ttl/tos
26 * Alan Cox : Cleaned up old debugging
27 * Alan Cox : Use new kernel side addresses
28 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
29 * Alan Cox : BSD style RAW socket demultiplexing.
30 * Alan Cox : Beginnings of mrouted support.
31 * Alan Cox : Added IP_HDRINCL option.
32 * Alan Cox : Skip broadcast check if BSDism set.
33 * David S. Miller : New socket lookup architecture.
35 * This program is free software; you can redistribute it and/or
36 * modify it under the terms of the GNU General Public License
37 * as published by the Free Software Foundation; either version
38 * 2 of the License, or (at your option) any later version.
41 #include <linux/config.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
44 #include <linux/types.h>
45 #include <linux/sched.h>
46 #include <linux/errno.h>
47 #include <linux/timer.h>
49 #include <linux/kernel.h>
50 #include <linux/fcntl.h>
51 #include <linux/socket.h>
53 #include <linux/inet.h>
54 #include <linux/netdevice.h>
55 #include <linux/mroute.h>
57 #include <net/protocol.h>
58 #include <linux/skbuff.h>
63 #include <net/checksum.h>
65 #ifdef CONFIG_IP_MROUTE
66 struct sock
*mroute_socket
=NULL
;
69 struct sock
*raw_v4_htable
[RAWV4_HTABLE_SIZE
];
71 static void raw_v4_hash(struct sock
*sk
)
73 struct sock
**skp
= &raw_v4_htable
[sk
->num
& (RAWV4_HTABLE_SIZE
- 1)];
75 SOCKHASH_LOCK_WRITE();
76 if ((sk
->next
= *skp
) != NULL
)
77 (*skp
)->pprev
= &sk
->next
;
81 if(sk
->prot
->highestinuse
< sk
->prot
->inuse
)
82 sk
->prot
->highestinuse
= sk
->prot
->inuse
;
83 SOCKHASH_UNLOCK_WRITE();
86 static void raw_v4_unhash(struct sock
*sk
)
88 SOCKHASH_LOCK_WRITE();
91 sk
->next
->pprev
= sk
->pprev
;
92 *sk
->pprev
= sk
->next
;
96 SOCKHASH_UNLOCK_WRITE();
99 static __inline__
struct sock
*__raw_v4_lookup(struct sock
*sk
, unsigned short num
,
100 unsigned long raddr
, unsigned long laddr
,
105 for(s
= sk
; s
; s
= s
->next
) {
106 if((s
->num
== num
) &&
107 !(s
->dead
&& (s
->state
== TCP_CLOSE
)) &&
108 !(s
->daddr
&& s
->daddr
!= raddr
) &&
109 !(s
->rcv_saddr
&& s
->rcv_saddr
!= laddr
) &&
110 !(s
->bound_dev_if
&& s
->bound_dev_if
!= dif
))
116 struct sock
*raw_v4_lookup(struct sock
*sk
, unsigned short num
,
117 unsigned long raddr
, unsigned long laddr
,
120 SOCKHASH_LOCK_READ();
121 sk
= __raw_v4_lookup(sk
, num
, raddr
, laddr
, dif
);
122 SOCKHASH_UNLOCK_READ();
131 static __inline__
int icmp_filter(struct sock
*sk
, struct sk_buff
*skb
)
135 type
= skb
->h
.icmph
->type
;
137 return test_bit(type
, &sk
->tp_pinfo
.tp_raw4
.filter
);
139 /* Do not block unknown ICMP types */
143 /* IP input processing comes here for RAW socket delivery.
144 * This is fun as to avoid copies we want to make no surplus
147 * RFC 1122: SHOULD pass TOS value up to the transport layer.
148 * -> It does. And not only TOS, but all IP header.
150 struct sock
*raw_v4_input(struct sk_buff
*skb
, struct iphdr
*iph
, int hash
)
154 SOCKHASH_LOCK_READ_BH();
155 if ((sk
= raw_v4_htable
[hash
]) == NULL
)
157 sk
= __raw_v4_lookup(sk
, iph
->protocol
,
158 iph
->saddr
, iph
->daddr
,
161 struct sock
*sknext
= __raw_v4_lookup(sk
->next
, iph
->protocol
,
162 iph
->saddr
, iph
->daddr
,
165 if (iph
->protocol
!= IPPROTO_ICMP
||
166 ! icmp_filter(sk
, skb
)) {
167 struct sk_buff
*clone
;
171 clone
= skb_clone(skb
, GFP_ATOMIC
);
173 SOCKHASH_UNLOCK_READ_BH();
175 SOCKHASH_LOCK_READ_BH();
181 SOCKHASH_UNLOCK_READ_BH();
186 void raw_err (struct sock
*sk
, struct sk_buff
*skb
)
188 int type
= skb
->h
.icmph
->type
;
189 int code
= skb
->h
.icmph
->code
;
194 /* Report error on raw socket, if:
195 1. User requested ip_recverr.
196 2. Socket is connected (otherwise the error indication
197 is useless without ip_recverr) and the error is hard.
199 if (!sk
->ip_recverr
&& sk
->state
!= TCP_ESTABLISHED
)
204 case ICMP_TIME_EXCEEDED
:
207 case ICMP_SOURCE_QUENCH
:
209 case ICMP_PARAMETERPROB
:
211 info
= ntohl(skb
->h
.icmph
->un
.gateway
)>>24;
214 case ICMP_DEST_UNREACH
:
216 if (code
> NR_ICMP_UNREACH
)
218 err
= icmp_err_convert
[code
].errno
;
219 harderr
= icmp_err_convert
[code
].fatal
;
220 if (code
== ICMP_FRAG_NEEDED
) {
221 harderr
= (sk
->ip_pmtudisc
!= IP_PMTUDISC_DONT
);
223 info
= ntohs(skb
->h
.icmph
->un
.frag
.mtu
);
228 ip_icmp_error(sk
, skb
, err
, 0, info
, (u8
*)(skb
->h
.icmph
+ 1));
230 if (sk
->ip_recverr
|| harderr
) {
232 sk
->error_report(sk
);
236 static int raw_rcv_skb(struct sock
* sk
, struct sk_buff
* skb
)
238 /* Charge it to the socket. */
240 if (sock_queue_rcv_skb(sk
,skb
)<0)
242 ip_statistics
.IpInDiscards
++;
247 ip_statistics
.IpInDelivers
++;
252 * This should be the easiest of all, all we do is
253 * copy it into a buffer. All demultiplexing is done
257 int raw_rcv(struct sock
*sk
, struct sk_buff
*skb
)
259 /* Now we need to copy this into memory. */
260 skb_trim(skb
, ntohs(skb
->nh
.iph
->tot_len
));
262 skb
->h
.raw
= skb
->nh
.raw
;
264 raw_rcv_skb(sk
, skb
);
275 * Send a RAW IP packet.
279 * Callback support is trivial for SOCK_RAW
282 static int raw_getfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
284 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
285 return memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
);
289 * IPPROTO_RAW needs extra work.
292 static int raw_getrawfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
294 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
296 if (memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
))
300 struct iphdr
*iph
= (struct iphdr
*)to
;
302 iph
->saddr
= rfh
->saddr
;
304 iph
->tot_len
=htons(fraglen
); /* This is right as you can't frag
307 * Deliberate breach of modularity to keep
308 * ip_build_xmit clean (well less messy).
311 iph
->id
= htons(ip_id_count
++);
312 iph
->check
=ip_fast_csum((unsigned char *)iph
, iph
->ihl
);
317 static int raw_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int len
)
319 struct ipcm_cookie ipc
;
320 struct rawfakehdr rfh
;
321 struct rtable
*rt
= NULL
;
327 /* This check is ONLY to check for arithmetic overflow
328 on integer(!) len. Not more! Real check will be made
329 in ip_build_xmit --ANK
331 BTW socket.c -> af_*.c -> ... make multiple
332 invalid conversions size_t -> int. We MUST repair it, e.g.
333 by replacing all of them with size_t and revising all
334 the places like len += sizeof(struct iphdr).
335 If len were ULONG_MAX-10 it would be a catastrophe --ANK
338 if (len
< 0 || len
> 0xFFFF)
345 if (msg
->msg_flags
& MSG_OOB
) /* Mirror BSD error message compatibility */
348 if (msg
->msg_flags
& ~(MSG_DONTROUTE
|MSG_DONTWAIT
))
352 * Get and verify the address.
355 if (msg
->msg_namelen
) {
356 struct sockaddr_in
*usin
= (struct sockaddr_in
*)msg
->msg_name
;
357 if (msg
->msg_namelen
< sizeof(*usin
))
359 if (usin
->sin_family
!= AF_INET
) {
360 static int complained
;
362 printk(KERN_INFO
"%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current
->comm
);
363 if (usin
->sin_family
)
366 daddr
= usin
->sin_addr
.s_addr
;
367 /* ANK: I did not forget to get protocol from port field.
368 * I just do not know who uses this weirdness.
369 * IP_HDRINCL is much more convenient.
372 if (sk
->state
!= TCP_ESTABLISHED
)
377 ipc
.addr
= sk
->saddr
;
379 ipc
.oif
= sk
->bound_dev_if
;
381 if (msg
->msg_controllen
) {
382 int tmp
= ip_cmsg_send(msg
, &ipc
);
389 rfh
.saddr
= ipc
.addr
;
397 /* Linux does not mangle headers on raw sockets,
398 * so that IP options + IP_HDRINCL is nonsense.
405 daddr
= ipc
.opt
->faddr
;
408 tos
= RT_TOS(sk
->ip_tos
) | sk
->localroute
;
409 if (msg
->msg_flags
&MSG_DONTROUTE
)
412 if (MULTICAST(daddr
)) {
414 ipc
.oif
= sk
->ip_mc_index
;
416 rfh
.saddr
= sk
->ip_mc_addr
;
419 err
= ip_route_output(&rt
, daddr
, rfh
.saddr
, tos
, ipc
.oif
);
425 if (rt
->rt_flags
&RTCF_BROADCAST
&& !sk
->broadcast
)
428 rfh
.iov
= msg
->msg_iov
;
429 rfh
.saddr
= rt
->rt_src
;
431 ipc
.addr
= rt
->rt_dst
;
432 err
=ip_build_xmit(sk
, sk
->ip_hdrincl
? raw_getrawfrag
: raw_getfrag
,
433 &rfh
, len
, &ipc
, rt
, msg
->msg_flags
);
440 return err
<0 ? err
: len
;
443 static void raw_close(struct sock
*sk
, long timeout
)
447 /* Observation: when raw_close is called, processes have
448 no access to socket anymore. But net still has.
449 Step one, detach it from networking:
451 A. Remove from hash tables.
453 sk
->state
= TCP_CLOSE
;
456 B. Raw sockets may have direct kernel references. Kill them.
458 ip_ra_control(sk
, 0, NULL
);
460 /* At this point the socket cannot receive new packets anymore */
463 /* But we still have packets pending on receive
464 queue and probably, our own packets waiting in device queues.
465 sock_destroy will drain receive queue, but transmitted
466 packets will delay socket destruction.
467 Set sk->dead=1 in order to prevent wakeups when these
468 packets are freed.
473 /* That's all. No races here. */
476 /* This gets rid of all the nasties in af_inet. -DaveM */
477 static int raw_bind(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
479 struct sockaddr_in
*addr
= (struct sockaddr_in
*) uaddr
;
482 if((sk
->state
!= TCP_CLOSE
) || (addr_len
< sizeof(struct sockaddr_in
)))
484 chk_addr_ret
= inet_addr_type(addr
->sin_addr
.s_addr
);
485 if(addr
->sin_addr
.s_addr
!= 0 && chk_addr_ret
!= RTN_LOCAL
&&
486 chk_addr_ret
!= RTN_MULTICAST
&& chk_addr_ret
!= RTN_BROADCAST
) {
487 #ifdef CONFIG_IP_TRANSPARENT_PROXY
488 /* Superuser may bind to any address to allow transparent proxying. */
489 if(chk_addr_ret
!= RTN_UNICAST
|| !capable(CAP_NET_ADMIN
))
491 return -EADDRNOTAVAIL
;
493 sk
->rcv_saddr
= sk
->saddr
= addr
->sin_addr
.s_addr
;
494 if(chk_addr_ret
== RTN_MULTICAST
|| chk_addr_ret
== RTN_BROADCAST
)
495 sk
->saddr
= 0; /* Use device */
496 dst_release(xchg(&sk
->dst_cache
, NULL
));
501 * This should be easy, if there is something there
502 * we return it, otherwise we block.
505 int raw_recvmsg(struct sock
*sk
, struct msghdr
*msg
, int len
,
506 int noblock
, int flags
,int *addr_len
)
511 struct sockaddr_in
*sin
=(struct sockaddr_in
*)msg
->msg_name
;
517 *addr_len
=sizeof(*sin
);
519 if (flags
& MSG_ERRQUEUE
)
520 return ip_recv_error(sk
, msg
, len
);
522 skb
=skb_recv_datagram(sk
,flags
,noblock
,&err
);
529 msg
->msg_flags
|= MSG_TRUNC
;
533 err
= skb_copy_datagram_iovec(skb
, 0, msg
->msg_iov
, copied
);
537 sk
->stamp
=skb
->stamp
;
539 /* Copy the address. */
541 sin
->sin_family
= AF_INET
;
542 sin
->sin_addr
.s_addr
= skb
->nh
.iph
->saddr
;
544 if (sk
->ip_cmsg_flags
)
545 ip_cmsg_recv(msg
, skb
);
547 skb_free_datagram(sk
, skb
);
548 return (err
? : copied
);
551 static int raw_init(struct sock
*sk
)
553 struct raw_opt
*tp
= &(sk
->tp_pinfo
.tp_raw4
);
554 if (sk
->num
== IPPROTO_ICMP
)
555 memset(&tp
->filter
, 0, sizeof(tp
->filter
));
559 static int raw_seticmpfilter(struct sock
*sk
, char *optval
, int optlen
)
561 if (optlen
> sizeof(struct icmp_filter
))
562 optlen
= sizeof(struct icmp_filter
);
563 if (copy_from_user(&sk
->tp_pinfo
.tp_raw4
.filter
, optval
, optlen
))
568 static int raw_geticmpfilter(struct sock
*sk
, char *optval
, int *optlen
)
572 if (get_user(len
,optlen
))
574 if (len
> sizeof(struct icmp_filter
))
575 len
= sizeof(struct icmp_filter
);
576 if (put_user(len
, optlen
))
578 if (copy_to_user(optval
, &sk
->tp_pinfo
.tp_raw4
.filter
, len
))
583 static int raw_setsockopt(struct sock
*sk
, int level
, int optname
,
584 char *optval
, int optlen
)
586 if (level
!= SOL_RAW
)
587 return ip_setsockopt(sk
, level
, optname
, optval
, optlen
);
591 if (sk
->num
!= IPPROTO_ICMP
)
593 return raw_seticmpfilter(sk
, optval
, optlen
);
599 static int raw_getsockopt(struct sock
*sk
, int level
, int optname
,
600 char *optval
, int *optlen
)
602 if (level
!= SOL_RAW
)
603 return ip_getsockopt(sk
, level
, optname
, optval
, optlen
);
607 if (sk
->num
!= IPPROTO_ICMP
)
609 return raw_geticmpfilter(sk
, optval
, optlen
);
615 static void get_raw_sock(struct sock
*sp
, char *tmpbuf
, int i
)
617 unsigned int dest
, src
;
620 unsigned long timer_expires
;
624 destp
= ntohs(sp
->dport
);
625 srcp
= ntohs(sp
->sport
);
626 timer_active
= (sp
->timer
.prev
!= NULL
) ? 2 : 0;
627 timer_expires
= (timer_active
== 2 ? sp
->timer
.expires
: jiffies
);
628 sprintf(tmpbuf
, "%4d: %08X:%04X %08X:%04X"
629 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
630 i
, src
, srcp
, dest
, destp
, sp
->state
,
631 atomic_read(&sp
->wmem_alloc
), atomic_read(&sp
->rmem_alloc
),
632 timer_active
, timer_expires
-jiffies
, 0,
633 sp
->socket
->inode
->i_uid
, timer_active
? sp
->timeout
: 0,
634 sp
->socket
? sp
->socket
->inode
->i_ino
: 0);
637 int raw_get_info(char *buffer
, char **start
, off_t offset
, int length
, int dummy
)
639 int len
= 0, num
= 0, i
;
645 len
+= sprintf(buffer
, "%-127s\n",
646 " sl local_address rem_address st tx_queue "
647 "rx_queue tr tm->when retrnsmt uid timeout inode");
649 SOCKHASH_LOCK_READ();
650 for (i
= 0; i
< RAWV4_HTABLE_SIZE
; i
++) {
653 for (sk
= raw_v4_htable
[i
]; sk
; sk
= sk
->next
, num
++) {
654 if (sk
->family
!= PF_INET
)
659 get_raw_sock(sk
, tmpbuf
, i
);
660 len
+= sprintf(buffer
+len
, "%-127s\n", tmpbuf
);
666 SOCKHASH_UNLOCK_READ();
667 begin
= len
- (pos
- offset
);
668 *start
= buffer
+ begin
;
677 struct proto raw_prot
= {
678 raw_close
, /* close */
679 udp_connect
, /* connect */
681 NULL
, /* retransmit */
682 NULL
, /* write_wakeup */
683 NULL
, /* read_wakeup */
684 datagram_poll
, /* poll */
685 #ifdef CONFIG_IP_MROUTE
686 ipmr_ioctl
, /* ioctl */
693 raw_setsockopt
, /* setsockopt */
694 raw_getsockopt
, /* getsockopt */
695 raw_sendmsg
, /* sendmsg */
696 raw_recvmsg
, /* recvmsg */
698 raw_rcv_skb
, /* backlog_rcv */
699 raw_v4_hash
, /* hash */
700 raw_v4_unhash
, /* unhash */
702 128, /* max_header */