Import 2.3.4
[davej-history.git] / net / ipv4 / raw.c
blobdd2e7555e64a39bfbc179bbb4a03e7d7ee271bef
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * RAW - implementation of IP "raw" sockets.
8 * Version: $Id: raw.c,v 1.41 1999/05/30 01:16:19 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
13 * Fixes:
14 * Alan Cox : verify_area() fixed up
15 * Alan Cox : ICMP error handling
16 * Alan Cox : EMSGSIZE if you send too big a packet
17 * Alan Cox : Now uses generic datagrams and shared skbuff
18 * library. No more peek crashes, no more backlogs
19 * Alan Cox : Checks sk->broadcast.
20 * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
21 * Alan Cox : Raw passes ip options too
22 * Alan Cox : Setsocketopt added
23 * Alan Cox : Fixed error return for broadcasts
24 * Alan Cox : Removed wake_up calls
25 * Alan Cox : Use ttl/tos
26 * Alan Cox : Cleaned up old debugging
27 * Alan Cox : Use new kernel side addresses
28 * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
29 * Alan Cox : BSD style RAW socket demultiplexing.
30 * Alan Cox : Beginnings of mrouted support.
31 * Alan Cox : Added IP_HDRINCL option.
32 * Alan Cox : Skip broadcast check if BSDism set.
33 * David S. Miller : New socket lookup architecture.
35 * This program is free software; you can redistribute it and/or
36 * modify it under the terms of the GNU General Public License
37 * as published by the Free Software Foundation; either version
38 * 2 of the License, or (at your option) any later version.
41 #include <linux/config.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
44 #include <linux/types.h>
45 #include <linux/sched.h>
46 #include <linux/errno.h>
47 #include <linux/timer.h>
48 #include <linux/mm.h>
49 #include <linux/kernel.h>
50 #include <linux/fcntl.h>
51 #include <linux/socket.h>
52 #include <linux/in.h>
53 #include <linux/inet.h>
54 #include <linux/netdevice.h>
55 #include <linux/mroute.h>
56 #include <net/ip.h>
57 #include <net/protocol.h>
58 #include <linux/skbuff.h>
59 #include <net/sock.h>
60 #include <net/icmp.h>
61 #include <net/udp.h>
62 #include <net/raw.h>
63 #include <net/checksum.h>
65 #ifdef CONFIG_IP_MROUTE
66 struct sock *mroute_socket=NULL;
67 #endif
69 struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE];
71 static void raw_v4_hash(struct sock *sk)
73 struct sock **skp;
74 int num = sk->num;
76 num &= (RAWV4_HTABLE_SIZE - 1);
77 skp = &raw_v4_htable[num];
78 SOCKHASH_LOCK_WRITE();
79 sk->next = *skp;
80 *skp = sk;
81 sk->hashent = num;
82 SOCKHASH_UNLOCK_WRITE();
85 static void raw_v4_unhash(struct sock *sk)
87 struct sock **skp;
88 int num = sk->num;
90 num &= (RAWV4_HTABLE_SIZE - 1);
91 skp = &raw_v4_htable[num];
93 SOCKHASH_LOCK_WRITE();
94 while(*skp != NULL) {
95 if(*skp == sk) {
96 *skp = sk->next;
97 break;
99 skp = &((*skp)->next);
101 SOCKHASH_UNLOCK_WRITE();
104 static void raw_v4_rehash(struct sock *sk)
106 struct sock **skp;
107 int num = sk->num;
108 int oldnum = sk->hashent;
110 num &= (RAWV4_HTABLE_SIZE - 1);
111 skp = &raw_v4_htable[oldnum];
113 SOCKHASH_LOCK_WRITE();
114 while(*skp != NULL) {
115 if(*skp == sk) {
116 *skp = sk->next;
117 break;
119 skp = &((*skp)->next);
121 sk->next = raw_v4_htable[num];
122 raw_v4_htable[num] = sk;
123 sk->hashent = num;
124 SOCKHASH_UNLOCK_WRITE();
127 static __inline__ struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
128 unsigned long raddr, unsigned long laddr,
129 int dif)
131 struct sock *s = sk;
133 for(s = sk; s; s = s->next) {
134 if((s->num == num) &&
135 !(s->dead && (s->state == TCP_CLOSE)) &&
136 !(s->daddr && s->daddr != raddr) &&
137 !(s->rcv_saddr && s->rcv_saddr != laddr) &&
138 !(s->bound_dev_if && s->bound_dev_if != dif))
139 break; /* gotcha */
141 return s;
/*
 *	Locked wrapper around __raw_v4_lookup(): performs the same search
 *	while holding the socket-hash read lock and returns its result.
 */
struct sock *raw_v4_lookup(struct sock *sk, unsigned short num,
			   unsigned long raddr, unsigned long laddr,
			   int dif)
{
	struct sock *match;

	SOCKHASH_LOCK_READ();
	match = __raw_v4_lookup(sk, num, raddr, laddr, dif);
	SOCKHASH_UNLOCK_READ();

	return match;
}
156 * 0 - deliver
157 * 1 - block
159 static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
161 int type;
163 type = skb->h.icmph->type;
164 if (type < 32)
165 return test_bit(type, &sk->tp_pinfo.tp_raw4.filter);
167 /* Do not block unknown ICMP types */
168 return 0;
171 /* IP input processing comes here for RAW socket delivery.
172 * This is fun as to avoid copies we want to make no surplus
173 * copies.
175 * RFC 1122: SHOULD pass TOS value up to the transport layer.
176 * -> It does. And not only TOS, but all IP header.
178 struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
180 struct sock *sk;
182 SOCKHASH_LOCK_READ_BH();
183 if ((sk = raw_v4_htable[hash]) == NULL)
184 goto out;
185 sk = __raw_v4_lookup(sk, iph->protocol,
186 iph->saddr, iph->daddr,
187 skb->dev->ifindex);
188 while(sk != NULL) {
189 struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol,
190 iph->saddr, iph->daddr,
191 skb->dev->ifindex);
193 if (iph->protocol != IPPROTO_ICMP ||
194 ! icmp_filter(sk, skb)) {
195 struct sk_buff *clone;
197 if(sknext == NULL)
198 break;
199 clone = skb_clone(skb, GFP_ATOMIC);
200 if(clone) {
201 SOCKHASH_UNLOCK_READ_BH();
202 raw_rcv(sk, clone);
203 SOCKHASH_LOCK_READ_BH();
206 sk = sknext;
208 out:
209 SOCKHASH_UNLOCK_READ_BH();
211 return sk;
214 void raw_err (struct sock *sk, struct sk_buff *skb)
216 int type = skb->h.icmph->type;
217 int code = skb->h.icmph->code;
218 u32 info = 0;
219 int err = 0;
220 int harderr = 0;
222 /* Report error on raw socket, if:
223 1. User requested ip_recverr.
224 2. Socket is connected (otherwise the error indication
225 is useless without ip_recverr and error is hard.
227 if (!sk->ip_recverr && sk->state != TCP_ESTABLISHED)
228 return;
230 switch (type) {
231 default:
232 case ICMP_TIME_EXCEEDED:
233 err = EHOSTUNREACH;
234 break;
235 case ICMP_SOURCE_QUENCH:
236 return;
237 case ICMP_PARAMETERPROB:
238 err = EPROTO;
239 info = ntohl(skb->h.icmph->un.gateway)>>24;
240 harderr = 1;
241 break;
242 case ICMP_DEST_UNREACH:
243 err = EHOSTUNREACH;
244 if (code > NR_ICMP_UNREACH)
245 break;
246 err = icmp_err_convert[code].errno;
247 harderr = icmp_err_convert[code].fatal;
248 if (code == ICMP_FRAG_NEEDED) {
249 harderr = (sk->ip_pmtudisc != IP_PMTUDISC_DONT);
250 err = EMSGSIZE;
251 info = ntohs(skb->h.icmph->un.frag.mtu);
255 if (sk->ip_recverr)
256 ip_icmp_error(sk, skb, err, 0, info, (u8 *)(skb->h.icmph + 1));
258 if (sk->ip_recverr || harderr) {
259 sk->err = err;
260 sk->error_report(sk);
264 static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
266 /* Charge it to the socket. */
268 if (sock_queue_rcv_skb(sk,skb)<0)
270 ip_statistics.IpInDiscards++;
271 kfree_skb(skb);
272 return -1;
275 ip_statistics.IpInDelivers++;
276 return 0;
280 * This should be the easiest of all, all we do is
281 * copy it into a buffer. All demultiplexing is done
282 * in ip.c
285 int raw_rcv(struct sock *sk, struct sk_buff *skb)
287 /* Now we need to copy this into memory. */
288 skb_trim(skb, ntohs(skb->nh.iph->tot_len));
290 skb->h.raw = skb->nh.raw;
292 raw_rcv_skb(sk, skb);
293 return 0;
296 struct rawfakehdr
298 struct iovec *iov;
299 u32 saddr;
303 * Send a RAW IP packet.
307 * Callback support is trivial for SOCK_RAW
310 static int raw_getfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen)
312 struct rawfakehdr *rfh = (struct rawfakehdr *) p;
313 return memcpy_fromiovecend(to, rfh->iov, offset, fraglen);
317 * IPPROTO_RAW needs extra work.
320 static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen)
322 struct rawfakehdr *rfh = (struct rawfakehdr *) p;
324 if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen))
325 return -EFAULT;
327 if (offset==0) {
328 struct iphdr *iph = (struct iphdr *)to;
329 if (!iph->saddr)
330 iph->saddr = rfh->saddr;
331 iph->check=0;
332 iph->tot_len=htons(fraglen); /* This is right as you can't frag
333 RAW packets */
335 * Deliberate breach of modularity to keep
336 * ip_build_xmit clean (well less messy).
338 if (!iph->id)
339 iph->id = htons(ip_id_count++);
340 iph->check=ip_fast_csum((unsigned char *)iph, iph->ihl);
342 return 0;
345 static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
347 struct ipcm_cookie ipc;
348 struct rawfakehdr rfh;
349 struct rtable *rt = NULL;
350 int free = 0;
351 u32 daddr;
352 u8 tos;
353 int err;
355 /* This check is ONLY to check for arithmetic overflow
356 on integer(!) len. Not more! Real check will be made
357 in ip_build_xmit --ANK
359 BTW socket.c -> af_*.c -> ... make multiple
360 invalid conversions size_t -> int. We MUST repair it f.e.
361 by replacing all of them with size_t and revise all
362 the places sort of len += sizeof(struct iphdr)
363 If len was ULONG_MAX-10 it would be cathastrophe --ANK
366 if (len < 0 || len > 0xFFFF)
367 return -EMSGSIZE;
370 * Check the flags.
373 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
374 return -EOPNOTSUPP;
376 if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT))
377 return(-EINVAL);
380 * Get and verify the address.
383 if (msg->msg_namelen) {
384 struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
385 if (msg->msg_namelen < sizeof(*usin))
386 return(-EINVAL);
387 if (usin->sin_family != AF_INET) {
388 static int complained;
389 if (!complained++)
390 printk(KERN_INFO "%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current->comm);
391 if (usin->sin_family)
392 return -EINVAL;
394 daddr = usin->sin_addr.s_addr;
395 /* ANK: I did not forget to get protocol from port field.
396 * I just do not know, who uses this weirdness.
397 * IP_HDRINCL is much more convenient.
399 } else {
400 if (sk->state != TCP_ESTABLISHED)
401 return(-EINVAL);
402 daddr = sk->daddr;
405 ipc.addr = sk->saddr;
406 ipc.opt = NULL;
407 ipc.oif = sk->bound_dev_if;
409 if (msg->msg_controllen) {
410 int tmp = ip_cmsg_send(msg, &ipc);
411 if (tmp)
412 return tmp;
413 if (ipc.opt)
414 free=1;
417 rfh.saddr = ipc.addr;
418 ipc.addr = daddr;
420 if (!ipc.opt)
421 ipc.opt = sk->opt;
423 if (ipc.opt) {
424 err = -EINVAL;
425 /* Linux does not mangle headers on raw sockets,
426 * so that IP options + IP_HDRINCL is non-sense.
428 if (sk->ip_hdrincl)
429 goto done;
430 if (ipc.opt->srr) {
431 if (!daddr)
432 goto done;
433 daddr = ipc.opt->faddr;
436 tos = RT_TOS(sk->ip_tos) | sk->localroute;
437 if (msg->msg_flags&MSG_DONTROUTE)
438 tos |= RTO_ONLINK;
440 if (MULTICAST(daddr)) {
441 if (!ipc.oif)
442 ipc.oif = sk->ip_mc_index;
443 if (!rfh.saddr)
444 rfh.saddr = sk->ip_mc_addr;
447 err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif);
449 if (err)
450 goto done;
452 err = -EACCES;
453 if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
454 goto done;
456 rfh.iov = msg->msg_iov;
457 rfh.saddr = rt->rt_src;
458 if (!ipc.addr)
459 ipc.addr = rt->rt_dst;
460 err=ip_build_xmit(sk, sk->ip_hdrincl ? raw_getrawfrag : raw_getfrag,
461 &rfh, len, &ipc, rt, msg->msg_flags);
463 done:
464 if (free)
465 kfree(ipc.opt);
466 ip_rt_put(rt);
468 return err<0 ? err : len;
471 static void raw_close(struct sock *sk, long timeout)
473 bh_lock_sock(sk);
475 /* Observation: when raw_close is called, processes have
476 no access to socket anymore. But net still has.
477 Step one, detach it from networking:
479 A. Remove from hash tables.
481 sk->state = TCP_CLOSE;
482 raw_v4_unhash(sk);
484 B. Raw sockets may have direct kernel refereneces. Kill them.
486 ip_ra_control(sk, 0, NULL);
488 /* In this point socket cannot receive new packets anymore */
491 /* But we still have packets pending on receive
492 queue and probably, our own packets waiting in device queues.
493 sock_destroy will drain receive queue, but transmitted
494 packets will delay socket destruction.
495 Set sk->dead=1 in order to prevent wakeups, when these
496 packet will be freed.
498 sk->dead=1;
499 destroy_sock(sk);
501 /* That's all. No races here. */
504 /* This gets rid of all the nasties in af_inet. -DaveM */
505 static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
507 struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
508 int chk_addr_ret;
510 if((sk->state != TCP_CLOSE) || (addr_len < sizeof(struct sockaddr_in)))
511 return -EINVAL;
512 chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
513 if(addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL &&
514 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) {
515 #ifdef CONFIG_IP_TRANSPARENT_PROXY
516 /* Superuser may bind to any address to allow transparent proxying. */
517 if(chk_addr_ret != RTN_UNICAST || !capable(CAP_NET_ADMIN))
518 #endif
519 return -EADDRNOTAVAIL;
521 sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
522 if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
523 sk->saddr = 0; /* Use device */
524 dst_release(xchg(&sk->dst_cache, NULL));
525 return 0;
529 * This should be easy, if there is something there
530 * we return it, otherwise we block.
533 int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len,
534 int noblock, int flags,int *addr_len)
536 int copied=0;
537 struct sk_buff *skb;
538 int err;
539 struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
541 if (flags & MSG_OOB)
542 return -EOPNOTSUPP;
544 if (addr_len)
545 *addr_len=sizeof(*sin);
547 if (flags & MSG_ERRQUEUE)
548 return ip_recv_error(sk, msg, len);
550 skb=skb_recv_datagram(sk,flags,noblock,&err);
551 if(skb==NULL)
552 return err;
554 copied = skb->len;
555 if (len < copied)
557 msg->msg_flags |= MSG_TRUNC;
558 copied = len;
561 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
562 if (err)
563 goto done;
565 sk->stamp=skb->stamp;
567 /* Copy the address. */
568 if (sin) {
569 sin->sin_family = AF_INET;
570 sin->sin_addr.s_addr = skb->nh.iph->saddr;
572 if (sk->ip_cmsg_flags)
573 ip_cmsg_recv(msg, skb);
574 done:
575 skb_free_datagram(sk, skb);
576 return (err ? : copied);
579 static int raw_init(struct sock *sk)
581 struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
582 if (sk->num == IPPROTO_ICMP)
583 memset(&tp->filter, 0, sizeof(tp->filter));
584 return 0;
587 static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen)
589 if (optlen > sizeof(struct icmp_filter))
590 optlen = sizeof(struct icmp_filter);
591 if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen))
592 return -EFAULT;
593 return 0;
596 static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen)
598 int len;
600 if (get_user(len,optlen))
601 return -EFAULT;
602 if (len > sizeof(struct icmp_filter))
603 len = sizeof(struct icmp_filter);
604 if (put_user(len, optlen))
605 return -EFAULT;
606 if (copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len))
607 return -EFAULT;
608 return 0;
611 static int raw_setsockopt(struct sock *sk, int level, int optname,
612 char *optval, int optlen)
614 if (level != SOL_RAW)
615 return ip_setsockopt(sk, level, optname, optval, optlen);
617 switch (optname) {
618 case ICMP_FILTER:
619 if (sk->num != IPPROTO_ICMP)
620 return -EOPNOTSUPP;
621 return raw_seticmpfilter(sk, optval, optlen);
624 return -ENOPROTOOPT;
627 static int raw_getsockopt(struct sock *sk, int level, int optname,
628 char *optval, int *optlen)
630 if (level != SOL_RAW)
631 return ip_getsockopt(sk, level, optname, optval, optlen);
633 switch (optname) {
634 case ICMP_FILTER:
635 if (sk->num != IPPROTO_ICMP)
636 return -EOPNOTSUPP;
637 return raw_geticmpfilter(sk, optval, optlen);
640 return -ENOPROTOOPT;
643 struct proto raw_prot = {
644 (struct sock *)&raw_prot, /* sklist_next */
645 (struct sock *)&raw_prot, /* sklist_prev */
646 raw_close, /* close */
647 udp_connect, /* connect */
648 NULL, /* accept */
649 NULL, /* retransmit */
650 NULL, /* write_wakeup */
651 NULL, /* read_wakeup */
652 datagram_poll, /* poll */
653 #ifdef CONFIG_IP_MROUTE
654 ipmr_ioctl, /* ioctl */
655 #else
656 NULL, /* ioctl */
657 #endif
658 raw_init, /* init */
659 NULL, /* destroy */
660 NULL, /* shutdown */
661 raw_setsockopt, /* setsockopt */
662 raw_getsockopt, /* getsockopt */
663 raw_sendmsg, /* sendmsg */
664 raw_recvmsg, /* recvmsg */
665 raw_bind, /* bind */
666 raw_rcv_skb, /* backlog_rcv */
667 raw_v4_hash, /* hash */
668 raw_v4_unhash, /* unhash */
669 raw_v4_rehash, /* rehash */
670 NULL, /* good_socknum */
671 NULL, /* verify_bind */
672 128, /* max_header */
673 0, /* retransmits */
674 "RAW", /* name */
675 0, /* inuse */
676 0 /* highestinuse */