Import 2.3.16
[davej-history.git] / net / ipv4 / ip_fragment.c
blob4e6ffef060ff899d44082c18ca99d66edcd121ca
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * The IP fragmentation functionality.
7 *
8 * Version: $Id: ip_fragment.c,v 1.45 1999/08/30 10:17:10 davem Exp $
10 * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox <Alan.Cox@linux.org>
13 * Fixes:
14 * Alan Cox : Split from ip.c , see ip_input.c for history.
15 * David S. Miller : Begin massive cleanup...
16 * Andi Kleen : Add sysctls.
17 * xxxx : Overlapfrag bug.
18 * Ultima : ip_expire() kernel panic.
19 * Bill Hawes : Frag accounting and evictor fixes.
20 * John McDonald : 0 length frag bug.
23 #include <linux/config.h>
24 #include <linux/types.h>
25 #include <linux/mm.h>
26 #include <linux/sched.h>
27 #include <linux/skbuff.h>
28 #include <linux/ip.h>
29 #include <linux/icmp.h>
30 #include <linux/netdevice.h>
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <linux/tcp.h>
35 #include <linux/udp.h>
36 #include <linux/inet.h>
37 #include <linux/netfilter_ipv4.h>
39 /* Fragment cache limits. We will commit 256K at one time. Should we
40 * cross that limit we will prune down to 192K. This should cope with
41 * even the most extreme cases without allowing an attacker to measurably
42 * harm machine performance.
44 int sysctl_ipfrag_high_thresh = 256*1024;
45 int sysctl_ipfrag_low_thresh = 192*1024;
47 int sysctl_ipfrag_time = IP_FRAG_TIME;
49 /* Describe an IP fragment. */
/* Describe an IP fragment.
 *
 * Fragments of one datagram are kept on a doubly linked list
 * (qp->fragments), sorted by 'offset'.
 */
struct ipfrag {
	int offset;		/* offset of fragment in IP datagram	*/
	int end;		/* first byte AFTER this fragment's data
				 * (exclusive: end == offset + len)	*/
	int len;		/* length of this fragment		*/
	struct sk_buff *skb;	/* complete received fragment		*/
	unsigned char *ptr;	/* pointer into real fragment data	*/
	struct ipfrag *next;	/* linked list pointers			*/
	struct ipfrag *prev;
};
60 /* Describe an entry in the "incomplete datagrams" queue. */
/* Describe an entry in the "incomplete datagrams" queue.
 *
 * One ipq exists per in-progress reassembly; entries are chained
 * from ipq_hash[] via next/pprev and expire via 'timer'.
 */
struct ipq {
	struct iphdr *iph;	/* pointer to a private copy of the IP
				 * header (64 + 8 byte buffer)		*/
	struct ipq *next;	/* hash-chain linkage			*/
	struct ipfrag *fragments; /* linked list of received fragments	*/
	int len;		/* total data length of the original
				 * datagram; 0 until the final (MF==0)
				 * fragment has arrived			*/
	short ihlen;		/* length of the IP header		*/
	struct timer_list timer; /* when will this queue expire?	*/
	struct ipq **pprev;	/* back-pointer for O(1) unlink		*/
	struct net_device *dev;	/* Device - for icmp replies		*/
};
72 #define IPQ_HASHSZ 64
74 static struct ipq *ipq_hash[IPQ_HASHSZ];
75 static spinlock_t ipfrag_lock = SPIN_LOCK_UNLOCKED;
77 #define ipqhashfn(id, saddr, daddr, prot) \
78 ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
80 atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
82 /* Memory Tracking Functions. */
83 extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
85 atomic_sub(skb->truesize, &ip_frag_mem);
86 kfree_skb(skb);
89 extern __inline__ void frag_kfree_s(void *ptr, int len)
91 atomic_sub(len, &ip_frag_mem);
92 kfree(ptr);
95 extern __inline__ void *frag_kmalloc(int size, int pri)
97 void *vp = kmalloc(size, pri);
99 if(!vp)
100 return NULL;
101 atomic_add(size, &ip_frag_mem);
102 return vp;
105 /* Create a new fragment entry. */
106 static struct ipfrag *ip_frag_create(int offset, int end,
107 struct sk_buff *skb, unsigned char *ptr)
109 struct ipfrag *fp;
111 fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC);
112 if (fp == NULL)
113 goto out_nomem;
115 /* Fill in the structure. */
116 fp->offset = offset;
117 fp->end = end;
118 fp->len = end - offset;
119 fp->skb = skb;
120 fp->ptr = ptr;
121 fp->next = fp->prev = NULL;
123 /* Charge for the SKB as well. */
124 atomic_add(skb->truesize, &ip_frag_mem);
126 return(fp);
128 out_nomem:
129 NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n"));
130 return(NULL);
133 /* Find the correct entry in the "incomplete datagrams" queue for
134 * this IP datagram, and return the queue entry address if found.
136 static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst)
138 __u16 id = iph->id;
139 __u32 saddr = iph->saddr;
140 __u32 daddr = iph->daddr;
141 __u8 protocol = iph->protocol;
142 unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
143 struct ipq *qp;
145 /* We are always in BH context, and protected by the
146 * ipfrag lock.
148 for(qp = ipq_hash[hash]; qp; qp = qp->next) {
149 if(qp->iph->id == id &&
150 qp->iph->saddr == saddr &&
151 qp->iph->daddr == daddr &&
152 qp->iph->protocol == protocol) {
153 del_timer(&qp->timer);
154 break;
157 return qp;
160 /* Remove an entry from the "incomplete datagrams" queue, either
161 * because we completed, reassembled and processed it, or because
162 * it timed out.
164 * This is called _only_ from BH contexts with the ipfrag lock held,
165 * on packet reception processing and from frag queue expiration
166 * timers. -DaveM
168 static void ip_free(struct ipq *qp)
170 struct ipfrag *fp;
172 /* Stop the timer for this entry. */
173 del_timer(&qp->timer);
175 /* Remove this entry from the "incomplete datagrams" queue. */
176 if(qp->next)
177 qp->next->pprev = qp->pprev;
178 *qp->pprev = qp->next;
180 /* Release all fragment data. */
181 fp = qp->fragments;
182 while (fp) {
183 struct ipfrag *xp = fp->next;
185 frag_kfree_skb(fp->skb);
186 frag_kfree_s(fp, sizeof(struct ipfrag));
187 fp = xp;
190 /* Release the IP header. */
191 frag_kfree_s(qp->iph, 64 + 8);
193 /* Finally, release the queue descriptor itself. */
194 frag_kfree_s(qp, sizeof(struct ipq));
198 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
200 static void ip_expire(unsigned long arg)
202 struct ipq *qp = (struct ipq *) arg;
204 spin_lock(&ipfrag_lock);
205 if(!qp->fragments)
207 #ifdef IP_EXPIRE_DEBUG
208 printk("warning: possible ip-expire attack\n");
209 #endif
210 goto out;
213 /* Send an ICMP "Fragment Reassembly Timeout" message. */
214 ip_statistics.IpReasmTimeout++;
215 ip_statistics.IpReasmFails++;
216 icmp_send(qp->fragments->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
218 out:
219 /* Nuke the fragment queue. */
220 ip_free(qp);
221 spin_unlock(&ipfrag_lock);
224 /* Memory limiting on fragments. Evictor trashes the oldest
225 * fragment queue until we are back under the low threshold.
227 * We are always called in BH with the ipfrag lock held.
229 static void ip_evictor(void)
231 int i, progress;
233 restart:
234 progress = 0;
235 /* FIXME: Make LRU queue of frag heads. -DaveM */
236 for (i = 0; i < IPQ_HASHSZ; i++) {
237 struct ipq *qp;
238 if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
239 return;
240 qp = ipq_hash[i];
241 if (qp) {
242 /* find the oldest queue for this hash bucket */
243 while (qp->next)
244 qp = qp->next;
245 ip_free(qp);
246 progress = 1;
249 if (progress)
250 goto restart;
251 panic("ip_evictor: memcount");
254 /* Add an entry to the 'ipq' queue for a newly received IP datagram.
255 * We will (hopefully :-) receive all other fragments of this datagram
256 * in time, so we just create a queue for this datagram, in which we
257 * will insert the received fragments at their respective positions.
259 static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
261 struct ipq *qp;
262 unsigned int hash;
263 int ihlen;
265 qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC);
266 if (qp == NULL)
267 goto out_nomem;
269 /* Allocate memory for the IP header (plus 8 octets for ICMP). */
270 ihlen = iph->ihl * 4;
272 qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC);
273 if (qp->iph == NULL)
274 goto out_free;
276 memcpy(qp->iph, iph, ihlen + 8);
277 qp->len = 0;
278 qp->ihlen = ihlen;
279 qp->fragments = NULL;
280 qp->dev = skb->dev;
282 /* Initialize a timer for this entry. */
283 init_timer(&qp->timer);
284 qp->timer.expires = 0; /* (to be set later) */
285 qp->timer.data = (unsigned long) qp; /* pointer to queue */
286 qp->timer.function = ip_expire; /* expire function */
288 /* Add this entry to the queue. */
289 hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
291 /* In a BH context and ipfrag lock is held. -DaveM */
292 if((qp->next = ipq_hash[hash]) != NULL)
293 qp->next->pprev = &qp->next;
294 ipq_hash[hash] = qp;
295 qp->pprev = &ipq_hash[hash];
297 return qp;
299 out_free:
300 frag_kfree_s(qp, sizeof(struct ipq));
301 out_nomem:
302 NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n"));
303 return(NULL);
306 /* See if a fragment queue is complete. */
307 static int ip_done(struct ipq *qp)
309 struct ipfrag *fp;
310 int offset;
312 /* Only possible if we received the final fragment. */
313 if (qp->len == 0)
314 return 0;
316 /* Check all fragment offsets to see if they connect. */
317 fp = qp->fragments;
318 offset = 0;
319 while (fp) {
320 if (fp->offset > offset)
321 return(0); /* fragment(s) missing */
322 offset = fp->end;
323 fp = fp->next;
326 /* All fragments are present. */
327 return 1;
330 /* Build a new IP datagram from all its fragments.
332 * FIXME: We copy here because we lack an effective way of handling lists
333 * of bits on input. Until the new skb data handling is in I'm not going
334 * to touch this with a bargepole.
336 static struct sk_buff *ip_glue(struct ipq *qp)
338 struct sk_buff *skb;
339 struct iphdr *iph;
340 struct ipfrag *fp;
341 unsigned char *ptr;
342 int count, len;
344 /* Allocate a new buffer for the datagram. */
345 len = qp->ihlen + qp->len;
347 if(len > 65535)
348 goto out_oversize;
350 skb = dev_alloc_skb(len);
351 if (!skb)
352 goto out_nomem;
354 /* Fill in the basic details. */
355 skb->mac.raw = ptr = skb->data;
356 skb->nh.iph = iph = (struct iphdr *) skb_put(skb, len);
358 /* Copy the original IP headers into the new buffer. */
359 memcpy(ptr, qp->iph, qp->ihlen);
360 ptr += qp->ihlen;
362 /* Copy the data portions of all fragments into the new buffer. */
363 fp = qp->fragments;
364 count = qp->ihlen;
365 while(fp) {
366 if ((fp->len <= 0) || ((count + fp->len) > skb->len))
367 goto out_invalid;
368 memcpy((ptr + fp->offset), fp->ptr, fp->len);
369 if (count == qp->ihlen) {
370 skb->dst = dst_clone(fp->skb->dst);
371 skb->dev = fp->skb->dev;
373 count += fp->len;
374 fp = fp->next;
377 skb->pkt_type = qp->fragments->skb->pkt_type;
378 skb->protocol = qp->fragments->skb->protocol;
380 * Clearly bogus, because security markings of the individual
381 * fragments should have been checked for consistency before
382 * gluing, and intermediate coalescing of fragments may have
383 * taken place in ip_defrag() before ip_glue() ever got called.
384 * If we're not going to do the consistency checking, we might
385 * as well take the value associated with the first fragment.
386 * --rct
388 skb->security = qp->fragments->skb->security;
390 #ifdef CONFIG_NETFILTER_DEBUG
391 skb->nf_debug = qp->fragments->skb->nf_debug;
392 #endif
394 /* Done with all fragments. Fixup the new IP header. */
395 iph = skb->nh.iph;
396 iph->frag_off = 0;
397 iph->tot_len = htons(count);
398 ip_statistics.IpReasmOKs++;
399 return skb;
401 out_invalid:
402 NETDEBUG(printk(KERN_ERR
403 "Invalid fragment list: Fragment over size.\n"));
404 kfree_skb(skb);
405 goto out_fail;
406 out_nomem:
407 NETDEBUG(printk(KERN_ERR
408 "IP: queue_glue: no memory for gluing queue %p\n",
409 qp));
410 goto out_fail;
411 out_oversize:
412 if (net_ratelimit())
413 printk(KERN_INFO
414 "Oversized IP packet from %d.%d.%d.%d.\n",
415 NIPQUAD(qp->iph->saddr));
416 out_fail:
417 ip_statistics.IpReasmFails++;
418 return NULL;
421 /* Process an incoming IP datagram fragment. */
422 struct sk_buff *ip_defrag(struct sk_buff *skb)
424 struct iphdr *iph = skb->nh.iph;
425 struct ipfrag *prev, *next, *tmp, *tfp;
426 struct ipq *qp;
427 unsigned char *ptr;
428 int flags, offset;
429 int i, ihl, end;
431 ip_statistics.IpReasmReqds++;
433 spin_lock(&ipfrag_lock);
435 /* Start by cleaning up the memory. */
436 if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
437 ip_evictor();
440 * Look for the entry for this IP datagram in the
441 * "incomplete datagrams" queue. If found, the
442 * timer is removed.
444 qp = ip_find(iph, skb->dst);
446 /* Is this a non-fragmented datagram? */
447 offset = ntohs(iph->frag_off);
448 flags = offset & ~IP_OFFSET;
449 offset &= IP_OFFSET;
451 offset <<= 3; /* offset is in 8-byte chunks */
452 ihl = iph->ihl * 4;
455 * Check whether to create a fresh queue entry. If the
456 * queue already exists, its timer will be restarted as
457 * long as we continue to receive fragments.
459 if (qp) {
460 /* ANK. If the first fragment is received,
461 * we should remember the correct IP header (with options)
463 if (offset == 0) {
464 /* Fragmented frame replaced by unfragmented copy? */
465 if ((flags & IP_MF) == 0)
466 goto out_freequeue;
467 qp->ihlen = ihl;
468 memcpy(qp->iph, iph, (ihl + 8));
470 } else {
471 /* Fragmented frame replaced by unfragmented copy? */
472 if ((offset == 0) && ((flags & IP_MF) == 0))
473 goto out_skb;
475 /* If we failed to create it, then discard the frame. */
476 qp = ip_create(skb, iph);
477 if (!qp)
478 goto out_freeskb;
481 /* Attempt to construct an oversize packet. */
482 if((ntohs(iph->tot_len) + ((int) offset)) > 65535)
483 goto out_oversize;
485 /* Determine the position of this fragment. */
486 end = offset + ntohs(iph->tot_len) - ihl;
488 /* Is this the final fragment? */
489 if ((flags & IP_MF) == 0)
490 qp->len = end;
492 /* Find out which fragments are in front and at the back of us
493 * in the chain of fragments so far. We must know where to put
494 * this fragment, right?
496 prev = NULL;
497 for(next = qp->fragments; next != NULL; next = next->next) {
498 if (next->offset >= offset)
499 break; /* bingo! */
500 prev = next;
503 /* Point into the IP datagram 'data' part. */
504 ptr = skb->data + ihl;
506 /* We found where to put this one. Check for overlap with
507 * preceding fragment, and, if needed, align things so that
508 * any overlaps are eliminated.
510 if ((prev != NULL) && (offset < prev->end)) {
511 i = prev->end - offset;
512 offset += i; /* ptr into datagram */
513 ptr += i; /* ptr into fragment data */
516 /* Look for overlap with succeeding segments.
517 * If we can merge fragments, do it.
519 for (tmp = next; tmp != NULL; tmp = tfp) {
520 tfp = tmp->next;
521 if (tmp->offset >= end)
522 break; /* no overlaps at all */
524 i = end - next->offset; /* overlap is 'i' bytes */
525 tmp->len -= i; /* so reduce size of */
526 tmp->offset += i; /* next fragment */
527 tmp->ptr += i;
529 /* If we get a frag size of <= 0, remove it and the packet
530 * that it goes with.
532 if (tmp->len <= 0) {
533 if (tmp->prev != NULL)
534 tmp->prev->next = tmp->next;
535 else
536 qp->fragments = tmp->next;
538 if (tmp->next != NULL)
539 tmp->next->prev = tmp->prev;
541 /* We have killed the original next frame. */
542 next = tfp;
544 frag_kfree_skb(tmp->skb);
545 frag_kfree_s(tmp, sizeof(struct ipfrag));
550 * Create a fragment to hold this skb.
551 * No memory to save the fragment? throw the lot ...
553 tfp = ip_frag_create(offset, end, skb, ptr);
554 if (!tfp)
555 goto out_freeskb;
557 /* Insert this fragment in the chain of fragments. */
558 tfp->prev = prev;
559 tfp->next = next;
560 if (prev != NULL)
561 prev->next = tfp;
562 else
563 qp->fragments = tfp;
565 if (next != NULL)
566 next->prev = tfp;
568 /* OK, so we inserted this new fragment into the chain.
569 * Check if we now have a full IP datagram which we can
570 * bump up to the IP layer...
572 if (ip_done(qp)) {
573 /* Glue together the fragments. */
574 skb = ip_glue(qp);
575 /* Free the queue entry. */
576 out_freequeue:
577 ip_free(qp);
578 out_skb:
579 spin_unlock(&ipfrag_lock);
580 return skb;
584 * The queue is still active ... reset its timer.
586 out_timer:
587 mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */
588 out:
589 spin_unlock(&ipfrag_lock);
590 return NULL;
593 * Error exits ... we need to reset the timer if there's a queue.
595 out_oversize:
596 if (net_ratelimit())
597 printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n",
598 NIPQUAD(iph->saddr));
599 /* the skb isn't in a fragment, so fall through to free it */
600 out_freeskb:
601 kfree_skb(skb);
602 ip_statistics.IpReasmFails++;
603 if (qp)
604 goto out_timer;
605 goto out;