2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * RAW - implementation of IP "raw" sockets.
8 * Version: $Id: raw.c,v 1.39 1998/11/08 11:17:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Alan Cox : verify_area() fixed up
15 * Alan Cox : ICMP error handling
16 * Alan Cox : EMSGSIZE if you send too big a packet
17 * Alan Cox : Now uses generic datagrams and shared skbuff
18 * library. No more peek crashes, no more backlogs
19 * Alan Cox : Checks sk->broadcast.
20 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
21 * Alan Cox : Raw passes ip options too
22 * Alan Cox : Setsocketopt added
23 * Alan Cox : Fixed error return for broadcasts
24 * Alan Cox : Removed wake_up calls
25 * Alan Cox : Use ttl/tos
26 * Alan Cox : Cleaned up old debugging
27 * Alan Cox : Use new kernel side addresses
28 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
29 * Alan Cox : BSD style RAW socket demultiplexing.
30 * Alan Cox : Beginnings of mrouted support.
31 * Alan Cox : Added IP_HDRINCL option.
32 * Alan Cox : Skip broadcast check if BSDism set.
33 * David S. Miller : New socket lookup architecture.
35 * This program is free software; you can redistribute it and/or
36 * modify it under the terms of the GNU General Public License
37 * as published by the Free Software Foundation; either version
38 * 2 of the License, or (at your option) any later version.
41 #include <linux/config.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
44 #include <linux/types.h>
45 #include <linux/sched.h>
46 #include <linux/errno.h>
47 #include <linux/timer.h>
49 #include <linux/kernel.h>
50 #include <linux/fcntl.h>
51 #include <linux/socket.h>
53 #include <linux/inet.h>
54 #include <linux/netdevice.h>
55 #include <linux/mroute.h>
57 #include <net/protocol.h>
58 #include <linux/skbuff.h>
63 #include <net/checksum.h>
65 #ifdef CONFIG_IP_MROUTE
66 struct sock
*mroute_socket
=NULL
;
69 struct sock
*raw_v4_htable
[RAWV4_HTABLE_SIZE
];
71 static void raw_v4_hash(struct sock
*sk
)
76 num
&= (RAWV4_HTABLE_SIZE
- 1);
77 skp
= &raw_v4_htable
[num
];
78 SOCKHASH_LOCK_WRITE();
82 SOCKHASH_UNLOCK_WRITE();
85 static void raw_v4_unhash(struct sock
*sk
)
90 num
&= (RAWV4_HTABLE_SIZE
- 1);
91 skp
= &raw_v4_htable
[num
];
93 SOCKHASH_LOCK_WRITE();
99 skp
= &((*skp
)->next
);
101 SOCKHASH_UNLOCK_WRITE();
104 static void raw_v4_rehash(struct sock
*sk
)
108 int oldnum
= sk
->hashent
;
110 num
&= (RAWV4_HTABLE_SIZE
- 1);
111 skp
= &raw_v4_htable
[oldnum
];
113 SOCKHASH_LOCK_WRITE();
114 while(*skp
!= NULL
) {
119 skp
= &((*skp
)->next
);
121 sk
->next
= raw_v4_htable
[num
];
122 raw_v4_htable
[num
] = sk
;
124 SOCKHASH_UNLOCK_WRITE();
127 /* Grumble... icmp and ip_input want to get at this... */
128 struct sock
*raw_v4_lookup(struct sock
*sk
, unsigned short num
,
129 unsigned long raddr
, unsigned long laddr
, int dif
)
133 SOCKHASH_LOCK_READ();
134 for(s
= sk
; s
; s
= s
->next
) {
135 if((s
->num
== num
) &&
136 !(s
->dead
&& (s
->state
== TCP_CLOSE
)) &&
137 !(s
->daddr
&& s
->daddr
!= raddr
) &&
138 !(s
->rcv_saddr
&& s
->rcv_saddr
!= laddr
) &&
139 !(s
->bound_dev_if
&& s
->bound_dev_if
!= dif
))
142 SOCKHASH_UNLOCK_READ();
146 void raw_err (struct sock
*sk
, struct sk_buff
*skb
)
148 int type
= skb
->h
.icmph
->type
;
149 int code
= skb
->h
.icmph
->code
;
154 /* Report error on raw socket, if:
155 1. User requested ip_recverr.
156 2. Socket is connected (otherwise the error indication
157 is useless without ip_recverr) and the error is hard.
159 if (!sk
->ip_recverr
&& sk
->state
!= TCP_ESTABLISHED
)
164 case ICMP_TIME_EXCEEDED
:
167 case ICMP_SOURCE_QUENCH
:
169 case ICMP_PARAMETERPROB
:
171 info
= ntohl(skb
->h
.icmph
->un
.gateway
)>>24;
174 case ICMP_DEST_UNREACH
:
176 if (code
> NR_ICMP_UNREACH
)
178 err
= icmp_err_convert
[code
].errno
;
179 harderr
= icmp_err_convert
[code
].fatal
;
180 if (code
== ICMP_FRAG_NEEDED
) {
181 harderr
= (sk
->ip_pmtudisc
!= IP_PMTUDISC_DONT
);
183 info
= ntohs(skb
->h
.icmph
->un
.frag
.mtu
);
188 ip_icmp_error(sk
, skb
, err
, 0, info
, (u8
*)(skb
->h
.icmph
+ 1));
190 if (sk
->ip_recverr
|| harderr
) {
192 sk
->error_report(sk
);
196 static int raw_rcv_skb(struct sock
* sk
, struct sk_buff
* skb
)
198 /* Charge it to the socket. */
200 if (sock_queue_rcv_skb(sk
,skb
)<0)
202 ip_statistics
.IpInDiscards
++;
207 ip_statistics
.IpInDelivers
++;
212 * This should be the easiest of all, all we do is
213 * copy it into a buffer. All demultiplexing is done
217 int raw_rcv(struct sock
*sk
, struct sk_buff
*skb
)
219 /* Now we need to copy this into memory. */
220 skb_trim(skb
, ntohs(skb
->nh
.iph
->tot_len
));
222 skb
->h
.raw
= skb
->nh
.raw
;
224 raw_rcv_skb(sk
, skb
);
235 * Send a RAW IP packet.
239 * Callback support is trivial for SOCK_RAW
242 static int raw_getfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
244 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
245 return memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
);
249 * IPPROTO_RAW needs extra work.
252 static int raw_getrawfrag(const void *p
, char *to
, unsigned int offset
, unsigned int fraglen
)
254 struct rawfakehdr
*rfh
= (struct rawfakehdr
*) p
;
256 if (memcpy_fromiovecend(to
, rfh
->iov
, offset
, fraglen
))
260 struct iphdr
*iph
= (struct iphdr
*)to
;
262 iph
->saddr
= rfh
->saddr
;
264 iph
->tot_len
=htons(fraglen
); /* This is right as you can't frag
267 * Deliberate breach of modularity to keep
268 * ip_build_xmit clean (well less messy).
271 iph
->id
= htons(ip_id_count
++);
272 iph
->check
=ip_fast_csum((unsigned char *)iph
, iph
->ihl
);
277 static int raw_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int len
)
279 struct ipcm_cookie ipc
;
280 struct rawfakehdr rfh
;
281 struct rtable
*rt
= NULL
;
287 /* This check is ONLY to catch arithmetic overflow
288 on integer(!) len. Nothing more! The real check is made
289 in ip_build_xmit --ANK
291 BTW socket.c -> af_*.c -> ... make multiple
292 invalid conversions size_t -> int. We MUST repair this, e.g.
293 by replacing all of them with size_t and revising all
294 the places of the form len += sizeof(struct iphdr).
295 If len were ULONG_MAX-10 it would be a catastrophe --ANK
298 if (len
< 0 || len
> 0xFFFF)
305 if (msg
->msg_flags
& MSG_OOB
) /* Mirror BSD error message compatibility */
308 if (msg
->msg_flags
& ~(MSG_DONTROUTE
|MSG_DONTWAIT
))
312 * Get and verify the address.
315 if (msg
->msg_namelen
) {
316 struct sockaddr_in
*usin
= (struct sockaddr_in
*)msg
->msg_name
;
317 if (msg
->msg_namelen
< sizeof(*usin
))
319 if (usin
->sin_family
!= AF_INET
) {
320 static int complained
;
322 printk(KERN_INFO
"%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current
->comm
);
323 if (usin
->sin_family
)
326 daddr
= usin
->sin_addr
.s_addr
;
327 /* ANK: I did not forget to get the protocol from the port field.
328 * I just do not know who uses this weirdness.
329 * IP_HDRINCL is much more convenient.
332 if (sk
->state
!= TCP_ESTABLISHED
)
337 ipc
.addr
= sk
->saddr
;
339 ipc
.oif
= sk
->bound_dev_if
;
341 if (msg
->msg_controllen
) {
342 int tmp
= ip_cmsg_send(msg
, &ipc
);
349 rfh
.saddr
= ipc
.addr
;
357 /* Linux does not mangle headers on raw sockets,
358 * so IP options + IP_HDRINCL is nonsense.
365 daddr
= ipc
.opt
->faddr
;
368 tos
= RT_TOS(sk
->ip_tos
) | sk
->localroute
;
369 if (msg
->msg_flags
&MSG_DONTROUTE
)
372 if (MULTICAST(daddr
)) {
374 ipc
.oif
= sk
->ip_mc_index
;
376 rfh
.saddr
= sk
->ip_mc_addr
;
379 err
= ip_route_output(&rt
, daddr
, rfh
.saddr
, tos
, ipc
.oif
);
385 if (rt
->rt_flags
&RTCF_BROADCAST
&& !sk
->broadcast
)
388 rfh
.iov
= msg
->msg_iov
;
389 rfh
.saddr
= rt
->rt_src
;
391 ipc
.addr
= rt
->rt_dst
;
392 err
=ip_build_xmit(sk
, sk
->ip_hdrincl
? raw_getrawfrag
: raw_getfrag
,
393 &rfh
, len
, &ipc
, rt
, msg
->msg_flags
);
400 return err
<0 ? err
: len
;
403 static void raw_close(struct sock
*sk
, long timeout
)
407 /* Observation: when raw_close is called, processes no longer
408 have access to the socket, but the network stack still does.
409 Step one, detach it from networking:
411 A. Remove from hash tables.
413 sk
->state
= TCP_CLOSE
;
416 B. Raw sockets may have direct kernel references. Kill them.
418 ip_ra_control(sk
, 0, NULL
);
420 /* At this point the socket cannot receive new packets anymore */
423 /* But we still have packets pending on the receive
424 queue and, probably, our own packets waiting in device queues.
425 sock_destroy will drain the receive queue, but transmitted
426 packets will delay socket destruction.
427 Set sk->dead=1 in order to prevent wakeups when these
428 packets are freed.
433 /* That's all. No races here. */
436 /* This gets rid of all the nasties in af_inet. -DaveM */
437 static int raw_bind(struct sock
*sk
, struct sockaddr
*uaddr
, int addr_len
)
439 struct sockaddr_in
*addr
= (struct sockaddr_in
*) uaddr
;
442 if((sk
->state
!= TCP_CLOSE
) || (addr_len
< sizeof(struct sockaddr_in
)))
444 chk_addr_ret
= inet_addr_type(addr
->sin_addr
.s_addr
);
445 if(addr
->sin_addr
.s_addr
!= 0 && chk_addr_ret
!= RTN_LOCAL
&&
446 chk_addr_ret
!= RTN_MULTICAST
&& chk_addr_ret
!= RTN_BROADCAST
) {
447 #ifdef CONFIG_IP_TRANSPARENT_PROXY
448 /* Superuser may bind to any address to allow transparent proxying. */
449 if(chk_addr_ret
!= RTN_UNICAST
|| !capable(CAP_NET_ADMIN
))
451 return -EADDRNOTAVAIL
;
453 sk
->rcv_saddr
= sk
->saddr
= addr
->sin_addr
.s_addr
;
454 if(chk_addr_ret
== RTN_MULTICAST
|| chk_addr_ret
== RTN_BROADCAST
)
455 sk
->saddr
= 0; /* Use device */
456 dst_release(xchg(&sk
->dst_cache
, NULL
));
461 * This should be easy: if there is something there
462 * we return it, otherwise we block.
465 int raw_recvmsg(struct sock
*sk
, struct msghdr
*msg
, int len
,
466 int noblock
, int flags
,int *addr_len
)
471 struct sockaddr_in
*sin
=(struct sockaddr_in
*)msg
->msg_name
;
477 *addr_len
=sizeof(*sin
);
479 if (flags
& MSG_ERRQUEUE
)
480 return ip_recv_error(sk
, msg
, len
);
482 skb
=skb_recv_datagram(sk
,flags
,noblock
,&err
);
489 msg
->msg_flags
|= MSG_TRUNC
;
493 err
= skb_copy_datagram_iovec(skb
, 0, msg
->msg_iov
, copied
);
497 sk
->stamp
=skb
->stamp
;
499 /* Copy the address. */
501 sin
->sin_family
= AF_INET
;
502 sin
->sin_addr
.s_addr
= skb
->nh
.iph
->saddr
;
504 if (sk
->ip_cmsg_flags
)
505 ip_cmsg_recv(msg
, skb
);
507 skb_free_datagram(sk
, skb
);
508 return (err
? : copied
);
511 static int raw_init(struct sock
*sk
)
513 struct raw_opt
*tp
= &(sk
->tp_pinfo
.tp_raw4
);
514 if (sk
->num
== IPPROTO_ICMP
)
515 memset(&tp
->filter
, 0, sizeof(tp
->filter
));
519 static int raw_seticmpfilter(struct sock
*sk
, char *optval
, int optlen
)
521 if (optlen
> sizeof(struct icmp_filter
))
522 optlen
= sizeof(struct icmp_filter
);
523 if (copy_from_user(&sk
->tp_pinfo
.tp_raw4
.filter
, optval
, optlen
))
528 static int raw_geticmpfilter(struct sock
*sk
, char *optval
, int *optlen
)
532 if (get_user(len
,optlen
))
534 if (len
> sizeof(struct icmp_filter
))
535 len
= sizeof(struct icmp_filter
);
536 if (put_user(len
, optlen
))
538 if (copy_to_user(optval
, &sk
->tp_pinfo
.tp_raw4
.filter
, len
))
543 static int raw_setsockopt(struct sock
*sk
, int level
, int optname
,
544 char *optval
, int optlen
)
546 if (level
!= SOL_RAW
)
547 return ip_setsockopt(sk
, level
, optname
, optval
, optlen
);
551 if (sk
->num
!= IPPROTO_ICMP
)
553 return raw_seticmpfilter(sk
, optval
, optlen
);
559 static int raw_getsockopt(struct sock
*sk
, int level
, int optname
,
560 char *optval
, int *optlen
)
562 if (level
!= SOL_RAW
)
563 return ip_getsockopt(sk
, level
, optname
, optval
, optlen
);
567 if (sk
->num
!= IPPROTO_ICMP
)
569 return raw_geticmpfilter(sk
, optval
, optlen
);
575 struct proto raw_prot
= {
576 (struct sock
*)&raw_prot
, /* sklist_next */
577 (struct sock
*)&raw_prot
, /* sklist_prev */
578 raw_close
, /* close */
579 udp_connect
, /* connect */
581 NULL
, /* retransmit */
582 NULL
, /* write_wakeup */
583 NULL
, /* read_wakeup */
584 datagram_poll
, /* poll */
585 #ifdef CONFIG_IP_MROUTE
586 ipmr_ioctl
, /* ioctl */
593 raw_setsockopt
, /* setsockopt */
594 raw_getsockopt
, /* getsockopt */
595 raw_sendmsg
, /* sendmsg */
596 raw_recvmsg
, /* recvmsg */
598 raw_rcv_skb
, /* backlog_rcv */
599 raw_v4_hash
, /* hash */
600 raw_v4_unhash
, /* unhash */
601 raw_v4_rehash
, /* rehash */
602 NULL
, /* good_socknum */
603 NULL
, /* verify_bind */
604 128, /* max_header */