/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Version:	$Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <Alan.Cox@linux.org>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller	:	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima		:	ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 */
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/firewall.h>
#include <linux/ip_fw.h>
/* Fragment cache limits. We will commit 256K at one time. Should we
 * cross that limit we will prune down to 192K. This should cope with
 * even the most extreme cases without allowing an attacker to measurably
 * harm machine performance.
 */
int sysctl_ipfrag_high_thresh = 256*1024;
int sysctl_ipfrag_low_thresh = 192*1024;

int sysctl_ipfrag_time = IP_FRAG_TIME;
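/* These limits are runtime tunables; assuming the usual sysctl wiring of
 * this kernel generation (done in sysctl_net_ipv4.c, not in this file),
 * they appear as /proc/sys/net/ipv4/ipfrag_high_thresh, ipfrag_low_thresh
 * and ipfrag_time.
 */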
/* Describe an IP fragment. */
struct ipfrag {
	int offset;			/* offset of fragment in IP datagram	*/
	int end;			/* last byte of data in datagram	*/
	int len;			/* length of this fragment		*/
	struct sk_buff *skb;		/* complete received fragment		*/
	unsigned char *ptr;		/* pointer into real fragment data	*/
	struct ipfrag *next;		/* linked list pointers			*/
	struct ipfrag *prev;
};
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
	struct iphdr *iph;		/* pointer to IP header			*/
	struct ipq *next;		/* linked list pointers			*/
	struct ipfrag *fragments;	/* linked list of received fragments	*/
	int len;			/* total length of original datagram	*/
	short ihlen;			/* length of the IP header		*/
	struct timer_list timer;	/* when will this queue expire?		*/
	struct ipq **pprev;
	struct device *dev;		/* Device - for icmp replies		*/
};
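/* A note on the linkage above, derived from the code below: each bucket of
 * ipq_hash[] is a singly linked chain threaded through 'next', while
 * 'pprev' points back at whatever pointer references this entry (the
 * bucket head or the previous entry's 'next').  That lets ip_free() unlink
 * a queue in O(1) without rescanning its bucket.  The per-queue
 * 'fragments' list is a separate, offset-ordered doubly linked list of
 * struct ipfrag.
 */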
#define IPQ_HASHSZ 64

struct ipq *ipq_hash[IPQ_HASHSZ];

#define ipqhashfn(id, saddr, daddr, prot) \
	((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
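/* The hash folds the datagram identification, both addresses and the
 * protocol into one word and keeps the low six bits, selecting one of the
 * 64 buckets.  Treating the fields as plain numbers for illustration:
 * id 0x1234, saddr 0xc0a80001, daddr 0xc0a80002, prot 17 gives
 * ((0x091a ^ 0xc0a80001 ^ 0xc0a80002 ^ 0x11) & 63) == 8, i.e. bucket 8.
 */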
atomic_t ip_frag_mem = ATOMIC_INIT(0);		/* Memory used for fragments */
/* Memory Tracking Functions. */
extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
{
	atomic_sub(skb->truesize, &ip_frag_mem);
	kfree_skb(skb);
}

extern __inline__ void frag_kfree_s(void *ptr, int len)
{
	atomic_sub(len, &ip_frag_mem);
	kfree(ptr);
}

extern __inline__ void *frag_kmalloc(int size, int pri)
{
	void *vp = kmalloc(size, pri);

	if(!vp)
		return NULL;
	atomic_add(size, &ip_frag_mem);
	return vp;
}
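/* ip_frag_mem is the accounting that ip_defrag() compares against
 * sysctl_ipfrag_high_thresh and that the evictor drains toward
 * sysctl_ipfrag_low_thresh, so every charge made here (or via the explicit
 * atomic_add of skb->truesize in ip_frag_create) must be released through
 * the matching frag_kfree_* helper, never plain kfree/kfree_skb.
 */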
/* Create a new fragment entry. */
static struct ipfrag *ip_frag_create(int offset, int end,
				     struct sk_buff *skb, unsigned char *ptr)
{
	struct ipfrag *fp;

	fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC);
	if (fp == NULL)
		goto out_nomem;

	/* Fill in the structure. */
	fp->offset = offset;
	fp->end = end;
	fp->len = end - offset;
	fp->skb = skb;
	fp->ptr = ptr;
	fp->next = fp->prev = NULL;

	/* Charge for the SKB as well. */
	atomic_add(skb->truesize, &ip_frag_mem);

	return(fp);

out_nomem:
	NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n"));
	return(NULL);
}
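/* On success the skb is owned by the new fragment (and charged to
 * ip_frag_mem); the caller still has to link the ipfrag into the queue's
 * fragment list.  On failure the skb is untouched and the caller frees it.
 */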
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and return the queue entry address if found.
 */
static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst)
{
	__u16 id = iph->id;
	__u32 saddr = iph->saddr;
	__u32 daddr = iph->daddr;
	__u8 protocol = iph->protocol;
	unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
	struct ipq *qp;

	/* Always, we are in a BH context, so no locking.  -DaveM */
	for(qp = ipq_hash[hash]; qp; qp = qp->next) {
		if(qp->iph->id == id &&
		   qp->iph->saddr == saddr &&
		   qp->iph->daddr == daddr &&
		   qp->iph->protocol == protocol) {
			del_timer(&qp->timer);
			break;
		}
	}
	return qp;
}
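/* Because ip_find() stops the per-queue timer on a hit, every caller must
 * either re-arm it (ip_defrag's out_timer path does mod_timer with
 * sysctl_ipfrag_time) or dispose of the queue; otherwise the entry would
 * sit in the hash table forever.
 */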
/* Remove an entry from the "incomplete datagrams" queue, either
 * because we completed, reassembled and processed it, or because
 * it timed out.
 *
 * This is called _only_ from BH contexts, on packet reception
 * processing and from frag queue expiration timers.  -DaveM
 */
static void ip_free(struct ipq *qp)
{
	struct ipfrag *fp;

	/* Stop the timer for this entry. */
	del_timer(&qp->timer);

	/* Remove this entry from the "incomplete datagrams" queue. */
	if(qp->next)
		qp->next->pprev = qp->pprev;
	*qp->pprev = qp->next;

	/* Release all fragment data. */
	fp = qp->fragments;
	while (fp) {
		struct ipfrag *xp = fp->next;

		frag_kfree_skb(fp->skb);
		frag_kfree_s(fp, sizeof(struct ipfrag));
		fp = xp;
	}

	/* Release the IP header. */
	frag_kfree_s(qp->iph, 64 + 8);

	/* Finally, release the queue descriptor itself. */
	frag_kfree_s(qp, sizeof(struct ipq));
}
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
static void ip_expire(unsigned long arg)
{
	struct ipq *qp = (struct ipq *) arg;

	if(!qp->fragments) {
#ifdef IP_EXPIRE_DEBUG
		printk("warning: possible ip-expire attack\n");
#endif
		goto out;
	}

	/* Send an ICMP "Fragment Reassembly Timeout" message. */
	ip_statistics.IpReasmTimeout++;
	ip_statistics.IpReasmFails++;
	icmp_send(qp->fragments->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);

out:
	/* Nuke the fragment queue. */
	ip_free(qp);
}
/* Memory limiting on fragments.  Evictor trashes the oldest
 * fragment queue until we are back under the low threshold.
 */
static void ip_evictor(void)
{
	int i, progress;

restart:
	progress = 0;
	/* FIXME: Make LRU queue of frag heads. -DaveM */
	for (i = 0; i < IPQ_HASHSZ; i++) {
		struct ipq *qp;
		if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
			return;
		/* We are in a BH context, so these queue
		 * accesses are safe.  -DaveM
		 */
		qp = ipq_hash[i];
		if (qp) {
			/* find the oldest queue for this hash bucket */
			while (qp->next)
				qp = qp->next;
			ip_free(qp);
			progress = 1;
		}
	}
	if (progress)
		goto restart;
	panic("ip_evictor: memcount");
}
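/* How the evictor behaves, as derived from the code above: new queues are
 * inserted at the head of their bucket, so the tail of each chain is the
 * oldest entry.  One pass frees at most one queue per bucket, and passes
 * repeat until ip_frag_mem drops to the low threshold.  If a full pass
 * frees nothing while memory is still over the limit, the accounting must
 * be corrupt, hence the panic().
 */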
/* Add an entry to the 'ipq' queue for a newly received IP datagram.
 * We will (hopefully :-) receive all other fragments of this datagram
 * in time, so we just create a queue for this datagram, in which we
 * will insert the received fragments at their respective positions.
 */
static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
{
	struct ipq *qp;
	unsigned int hash;
	int ihlen;

	qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC);
	if (qp == NULL)
		goto out_nomem;

	/* Allocate memory for the IP header (plus 8 octets for ICMP). */
	ihlen = iph->ihl * 4;

	qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC);
	if (qp->iph == NULL)
		goto out_free;

	memcpy(qp->iph, iph, ihlen + 8);
	qp->len = 0;
	qp->ihlen = ihlen;
	qp->fragments = NULL;
	qp->dev = skb->dev;

	/* Initialize a timer for this entry. */
	init_timer(&qp->timer);
	qp->timer.expires = 0;			/* (to be set later)	*/
	qp->timer.data = (unsigned long) qp;	/* pointer to queue	*/
	qp->timer.function = ip_expire;		/* expire function	*/

	/* Add this entry to the queue. */
	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);

	/* We are in a BH context, no locking necessary.  -DaveM */
	if((qp->next = ipq_hash[hash]) != NULL)
		qp->next->pprev = &qp->next;
	ipq_hash[hash] = qp;
	qp->pprev = &ipq_hash[hash];

	return qp;

out_free:
	frag_kfree_s(qp, sizeof(struct ipq));
out_nomem:
	NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n"));
	return(NULL);
}
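/* Two details worth spelling out: the 64 + 8 byte buffer is sized for the
 * largest possible IP header (60 bytes, rounded up) plus the 8 octets of
 * transport header that an ICMP error must echo, and the same constant is
 * used when ip_free() releases it.  Also, the timer is only initialized
 * here; it is armed by ip_defrag() with sysctl_ipfrag_time once the first
 * fragment has been queued.
 */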
/* See if a fragment queue is complete. */
static int ip_done(struct ipq *qp)
{
	struct ipfrag *fp;
	int offset;

	/* Only possible if we received the final fragment. */
	if (qp->len == 0)
		return 0;

	/* Check all fragment offsets to see if they connect. */
	fp = qp->fragments;
	offset = 0;
	while (fp) {
		if (fp->offset > offset)
			return(0);	/* fragment(s) missing */
		offset = fp->end;
		fp = fp->next;
	}

	/* All fragments are present. */
	return 1;
}
/* Build a new IP datagram from all its fragments.
 *
 * FIXME: We copy here because we lack an effective way of handling lists
 * of bits on input. Until the new skb data handling is in I'm not going
 * to touch this with a bargepole.
 */
static struct sk_buff *ip_glue(struct ipq *qp)
{
	struct sk_buff *skb;
	struct iphdr *iph;
	struct ipfrag *fp;
	unsigned char *ptr;
	int count, len;

	/* Allocate a new buffer for the datagram. */
	len = qp->ihlen + qp->len;

	if(len > 65535)
		goto out_oversize;

	skb = dev_alloc_skb(len);
	if (!skb)
		goto out_nomem;

	/* Fill in the basic details. */
	skb->mac.raw = ptr = skb->data;
	skb->nh.iph = iph = (struct iphdr *) skb_put(skb, len);

	/* Copy the original IP headers into the new buffer. */
	memcpy(ptr, qp->iph, qp->ihlen);
	ptr += qp->ihlen;

	/* Copy the data portions of all fragments into the new buffer. */
	fp = qp->fragments;
	count = qp->ihlen;
	while(fp) {
		if ((fp->len <= 0) || ((count + fp->len) > skb->len))
			goto out_invalid;
		memcpy((ptr + fp->offset), fp->ptr, fp->len);
		if (count == qp->ihlen) {
			skb->dst = dst_clone(fp->skb->dst);
			skb->dev = fp->skb->dev;
		}
		count += fp->len;
		fp = fp->next;
	}

	skb->pkt_type = qp->fragments->skb->pkt_type;
	skb->protocol = qp->fragments->skb->protocol;
	/*
	 * Clearly bogus, because security markings of the individual
	 * fragments should have been checked for consistency before
	 * gluing, and intermediate coalescing of fragments may have
	 * taken place in ip_defrag() before ip_glue() ever got called.
	 * If we're not going to do the consistency checking, we might
	 * as well take the value associated with the first fragment.
	 *	--rct
	 */
	skb->security = qp->fragments->skb->security;

	/* Done with all fragments. Fixup the new IP header. */
	iph = skb->nh.iph;
	iph->frag_off = 0;
	iph->tot_len = htons(count);
	ip_statistics.IpReasmOKs++;
	return skb;

out_invalid:
	NETDEBUG(printk(KERN_ERR
			"Invalid fragment list: Fragment over size.\n"));
	kfree_skb(skb);
	goto out_fail;
out_nomem:
	NETDEBUG(printk(KERN_ERR
			"IP: queue_glue: no memory for gluing queue %p\n",
			qp));
	goto out_fail;
out_oversize:
	if (net_ratelimit())
		printk(KERN_INFO
			"Oversized IP packet from %d.%d.%d.%d.\n",
			NIPQUAD(qp->iph->saddr));
out_fail:
	ip_statistics.IpReasmFails++;
	return NULL;
}
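/* Summary of ip_glue(): it linearizes the datagram into one freshly
 * allocated skb, copying the saved header first and then each fragment's
 * payload at header + fragment offset.  dst and dev are cloned from the
 * first fragment in the list, 'count' accumulates the bytes placed so a
 * fragment extending past the allocation is caught as "Fragment over
 * size", and the rebuilt header gets frag_off cleared and tot_len set to
 * the reassembled length.
 */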
/* Process an incoming IP datagram fragment. */
struct sk_buff *ip_defrag(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;
	struct ipfrag *prev, *next, *tmp, *tfp;
	struct ipq *qp;
	unsigned char *ptr;
	int flags, offset;
	int i, ihl, end;

	ip_statistics.IpReasmReqds++;

	/* Start by cleaning up the memory. */
	if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
		ip_evictor();

	/*
	 * Look for the entry for this IP datagram in the
	 * "incomplete datagrams" queue. If found, the
	 * timer is removed.
	 */
	qp = ip_find(iph, skb->dst);

	/* Is this a non-fragmented datagram? */
	offset = ntohs(iph->frag_off);
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;

	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = iph->ihl * 4;
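	/* Decoding note: iph->frag_off packs the DF/MF flags into the top
	 * three bits and the fragment offset (in 8-byte units) into the low
	 * 13 bits, so 'flags' keeps the bits above IP_OFFSET and the offset
	 * is scaled by 8.  For instance, a frag_off of 0x20b9 means MF is
	 * set and the payload starts at 0xb9 * 8 = 1480 bytes, the typical
	 * second fragment of a datagram split on a 1500-byte MTU.
	 */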
	/*
	 * Check whether to create a fresh queue entry. If the
	 * queue already exists, its timer will be restarted as
	 * long as we continue to receive fragments.
	 */
	if (qp) {
		/* ANK. If the first fragment is received,
		 * we should remember the correct IP header (with options)
		 */
		if (offset == 0) {
			/* Fragmented frame replaced by unfragmented copy? */
			if ((flags & IP_MF) == 0)
				goto out_freequeue;
			qp->ihlen = ihl;
			memcpy(qp->iph, iph, (ihl + 8));
		}
	} else {
		/* Fragmented frame replaced by unfragmented copy? */
		if ((offset == 0) && ((flags & IP_MF) == 0))
			goto out_skb;

		/* If we failed to create it, then discard the frame. */
		qp = ip_create(skb, iph);
		if (!qp)
			goto out_freeskb;
	}
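	/* Both "unfragmented copy" branches above handle the same situation:
	 * a packet with offset 0 and MF clear is a complete datagram, not a
	 * fragment.  If a partial queue already existed it is dropped via
	 * out_freequeue (which then returns this skb); with no queue pending
	 * the skb is simply handed back through out_skb.
	 */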
	/* Attempt to construct an oversize packet. */
	if((ntohs(iph->tot_len) + ((int) offset)) > 65535)
		goto out_oversize;
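	/* This guards reassembly against fragments whose offset plus length
	 * would push the datagram past the 65535-byte IP maximum, the same
	 * class of overflow exploited by the old "ping of death" oversized
	 * fragments (background context, not stated in this file).
	 */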
	/* Determine the position of this fragment. */
	end = offset + ntohs(iph->tot_len) - ihl;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0)
		qp->len = end;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = NULL;
	for(next = qp->fragments; next != NULL; next = next->next) {
		if (next->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}
	/* Point into the IP datagram 'data' part. */
	ptr = skb->data + ihl;

	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if ((prev != NULL) && (offset < prev->end)) {
		i = prev->end - offset;
		offset += i;	/* ptr into datagram */
		ptr += i;	/* ptr into fragment data */
	}
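	/* In other words, when the new fragment starts inside the one before
	 * it, the new fragment is trimmed from the front: its recorded
	 * offset and its data pointer are both advanced past the overlap, so
	 * the bytes already queued take precedence.
	 */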
	/* Look for overlap with succeeding segments.
	 * If we can merge fragments, do it.
	 */
	for (tmp = next; tmp != NULL; tmp = tfp) {
		tfp = tmp->next;
		if (tmp->offset >= end)
			break;		/* no overlaps at all */

		i = end - next->offset;		/* overlap is 'i' bytes */
		tmp->len -= i;			/* so reduce size of	*/
		tmp->offset += i;		/* next fragment	*/
		tmp->ptr += i;

		/* If we get a frag size of <= 0, remove it and the packet
		 * that it goes with.
		 */
		if (tmp->len <= 0) {
			if (tmp->prev != NULL)
				tmp->prev->next = tmp->next;
			else
				qp->fragments = tmp->next;

			if (tmp->next != NULL)
				tmp->next->prev = tmp->prev;

			/* We have killed the original next frame. */
			next = tfp;

			frag_kfree_skb(tmp->skb);
			frag_kfree_s(tmp, sizeof(struct ipfrag));
		}
	}
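	/* The mirror case: queued fragments that begin before 'end' are
	 * advanced past the overlap (measured against the first overlapping
	 * entry, 'next'), and any fragment whose length drops to zero or
	 * below is unlinked and freed, with 'next' moved along if it was the
	 * one removed; here the incoming data wins the trailing overlap.
	 */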
	/*
	 * Create a fragment to hold this skb.
	 * No memory to save the fragment? throw the lot ...
	 */
	tfp = ip_frag_create(offset, end, skb, ptr);
	if (!tfp)
		goto out_freeskb;

	/* Insert this fragment in the chain of fragments. */
	tfp->prev = prev;
	tfp->next = next;
	if (prev != NULL)
		prev->next = tfp;
	else
		qp->fragments = tfp;

	if (next != NULL)
		next->prev = tfp;
	/* OK, so we inserted this new fragment into the chain.
	 * Check if we now have a full IP datagram which we can
	 * bump up to the IP layer...
	 */
	if (ip_done(qp)) {
		/* Glue together the fragments. */
		skb = ip_glue(qp);
		/* Free the queue entry. */
out_freequeue:
		ip_free(qp);
out_skb:
		return skb;
	}

	/*
	 * The queue is still active ... reset its timer.
	 */
out_timer:
	mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time);	/* ~ 30 seconds */
out:
	return NULL;
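	/* The label placement above is deliberate: out_freequeue and out_skb
	 * sit inside the ip_done() branch so that the early "unfragmented
	 * copy" jumps reuse exactly this free-the-queue / return-the-skb
	 * code, while out_timer re-arms the expiry of a still-incomplete
	 * queue with sysctl_ipfrag_time (nominally 30 seconds).
	 */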
	/*
	 * Error exits ... we need to reset the timer if there's a queue.
	 */
out_oversize:
	if (net_ratelimit())
		printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n",
			NIPQUAD(iph->saddr));
	/* the skb isn't in a fragment, so fall through to free it */
out_freeskb:
	kfree_skb(skb);
	ip_statistics.IpReasmFails++;
	if (qp)
		goto out_timer;
	goto out;
}