/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Version:	$Id: ip_fragment.c,v 1.45 1999/08/30 10:17:10 davem Exp $
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <Alan.Cox@linux.org>
 *
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 */
23 #include <linux/config.h>
24 #include <linux/types.h>
26 #include <linux/sched.h>
27 #include <linux/skbuff.h>
29 #include <linux/icmp.h>
30 #include <linux/netdevice.h>
34 #include <linux/tcp.h>
35 #include <linux/udp.h>
36 #include <linux/inet.h>
37 #include <linux/netfilter_ipv4.h>
/* Fragment cache limits.  We will commit 256K at one time.  Should we
 * cross that limit we will prune down to 192K.  This should cope with
 * even the most extreme cases without allowing an attacker to measurably
 * harm machine performance.
 *
 * Both values are byte counts compared against ip_frag_mem:
 *  - ip_defrag() checks ip_frag_mem > sysctl_ipfrag_high_thresh before
 *    looking up the queue for an incoming fragment;
 *  - ip_evictor() checks ip_frag_mem <= sysctl_ipfrag_low_thresh inside
 *    its per-bucket loop.
 */
int sysctl_ipfrag_high_thresh = 256*1024;
int sysctl_ipfrag_low_thresh = 192*1024;
47 int sysctl_ipfrag_time
= IP_FRAG_TIME
;
49 /* Describe an IP fragment. */
51 int offset
; /* offset of fragment in IP datagram */
52 int end
; /* last byte of data in datagram */
53 int len
; /* length of this fragment */
54 struct sk_buff
*skb
; /* complete received fragment */
55 unsigned char *ptr
; /* pointer into real fragment data */
56 struct ipfrag
*next
; /* linked list pointers */
60 /* Describe an entry in the "incomplete datagrams" queue. */
62 struct iphdr
*iph
; /* pointer to IP header */
63 struct ipq
*next
; /* linked list pointers */
64 struct ipfrag
*fragments
; /* linked list of received fragments */
65 int len
; /* total length of original datagram */
66 short ihlen
; /* length of the IP header */
67 struct timer_list timer
; /* when will this queue expire? */
69 struct net_device
*dev
; /* Device - for icmp replies */
74 static struct ipq
*ipq_hash
[IPQ_HASHSZ
];
75 static spinlock_t ipfrag_lock
= SPIN_LOCK_UNLOCKED
;
/* Map a fragment's (id, saddr, daddr, prot) tuple to an index into
 * ipq_hash[].  The id is shifted right by one, XORed with the two
 * addresses and the protocol, then masked with (IPQ_HASHSZ - 1).
 * Every argument is parenthesized and evaluated exactly once.
 *
 * NOTE(review): the mask only covers every bucket if IPQ_HASHSZ is a
 * power of two; IPQ_HASHSZ is defined outside this file -- confirm.
 */
#define ipqhashfn(id, saddr, daddr, prot) \
	((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
80 atomic_t ip_frag_mem
= ATOMIC_INIT(0); /* Memory used for fragments */
82 /* Memory Tracking Functions. */
83 extern __inline__
void frag_kfree_skb(struct sk_buff
*skb
)
85 atomic_sub(skb
->truesize
, &ip_frag_mem
);
89 extern __inline__
void frag_kfree_s(void *ptr
, int len
)
91 atomic_sub(len
, &ip_frag_mem
);
95 extern __inline__
void *frag_kmalloc(int size
, int pri
)
97 void *vp
= kmalloc(size
, pri
);
101 atomic_add(size
, &ip_frag_mem
);
105 /* Create a new fragment entry. */
106 static struct ipfrag
*ip_frag_create(int offset
, int end
,
107 struct sk_buff
*skb
, unsigned char *ptr
)
111 fp
= (struct ipfrag
*) frag_kmalloc(sizeof(struct ipfrag
), GFP_ATOMIC
);
115 /* Fill in the structure. */
118 fp
->len
= end
- offset
;
121 fp
->next
= fp
->prev
= NULL
;
123 /* Charge for the SKB as well. */
124 atomic_add(skb
->truesize
, &ip_frag_mem
);
129 NETDEBUG(printk(KERN_ERR
"IP: frag_create: no memory left !\n"));
133 /* Find the correct entry in the "incomplete datagrams" queue for
134 * this IP datagram, and return the queue entry address if found.
136 static inline struct ipq
*ip_find(struct iphdr
*iph
, struct dst_entry
*dst
)
139 __u32 saddr
= iph
->saddr
;
140 __u32 daddr
= iph
->daddr
;
141 __u8 protocol
= iph
->protocol
;
142 unsigned int hash
= ipqhashfn(id
, saddr
, daddr
, protocol
);
145 /* We are always in BH context, and protected by the
148 for(qp
= ipq_hash
[hash
]; qp
; qp
= qp
->next
) {
149 if(qp
->iph
->id
== id
&&
150 qp
->iph
->saddr
== saddr
&&
151 qp
->iph
->daddr
== daddr
&&
152 qp
->iph
->protocol
== protocol
) {
153 del_timer(&qp
->timer
);
160 /* Remove an entry from the "incomplete datagrams" queue, either
161 * because we completed, reassembled and processed it, or because
164 * This is called _only_ from BH contexts with the ipfrag lock held,
165 * on packet reception processing and from frag queue expiration
168 static void ip_free(struct ipq
*qp
)
172 /* Stop the timer for this entry. */
173 del_timer(&qp
->timer
);
175 /* Remove this entry from the "incomplete datagrams" queue. */
177 qp
->next
->pprev
= qp
->pprev
;
178 *qp
->pprev
= qp
->next
;
180 /* Release all fragment data. */
183 struct ipfrag
*xp
= fp
->next
;
185 frag_kfree_skb(fp
->skb
);
186 frag_kfree_s(fp
, sizeof(struct ipfrag
));
190 /* Release the IP header. */
191 frag_kfree_s(qp
->iph
, 64 + 8);
193 /* Finally, release the queue descriptor itself. */
194 frag_kfree_s(qp
, sizeof(struct ipq
));
198 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
200 static void ip_expire(unsigned long arg
)
202 struct ipq
*qp
= (struct ipq
*) arg
;
204 spin_lock(&ipfrag_lock
);
207 #ifdef IP_EXPIRE_DEBUG
208 printk("warning: possible ip-expire attack\n");
213 /* Send an ICMP "Fragment Reassembly Timeout" message. */
214 ip_statistics
.IpReasmTimeout
++;
215 ip_statistics
.IpReasmFails
++;
216 icmp_send(qp
->fragments
->skb
, ICMP_TIME_EXCEEDED
, ICMP_EXC_FRAGTIME
, 0);
219 /* Nuke the fragment queue. */
221 spin_unlock(&ipfrag_lock
);
224 /* Memory limiting on fragments. Evictor trashes the oldest
225 * fragment queue until we are back under the low threshold.
227 * We are always called in BH with the ipfrag lock held.
229 static void ip_evictor(void)
235 /* FIXME: Make LRU queue of frag heads. -DaveM */
236 for (i
= 0; i
< IPQ_HASHSZ
; i
++) {
238 if (atomic_read(&ip_frag_mem
) <= sysctl_ipfrag_low_thresh
)
242 /* find the oldest queue for this hash bucket */
251 panic("ip_evictor: memcount");
254 /* Add an entry to the 'ipq' queue for a newly received IP datagram.
255 * We will (hopefully :-) receive all other fragments of this datagram
256 * in time, so we just create a queue for this datagram, in which we
257 * will insert the received fragments at their respective positions.
259 static struct ipq
*ip_create(struct sk_buff
*skb
, struct iphdr
*iph
)
265 qp
= (struct ipq
*) frag_kmalloc(sizeof(struct ipq
), GFP_ATOMIC
);
269 /* Allocate memory for the IP header (plus 8 octets for ICMP). */
270 ihlen
= iph
->ihl
* 4;
272 qp
->iph
= (struct iphdr
*) frag_kmalloc(64 + 8, GFP_ATOMIC
);
276 memcpy(qp
->iph
, iph
, ihlen
+ 8);
279 qp
->fragments
= NULL
;
282 /* Initialize a timer for this entry. */
283 init_timer(&qp
->timer
);
284 qp
->timer
.expires
= 0; /* (to be set later) */
285 qp
->timer
.data
= (unsigned long) qp
; /* pointer to queue */
286 qp
->timer
.function
= ip_expire
; /* expire function */
288 /* Add this entry to the queue. */
289 hash
= ipqhashfn(iph
->id
, iph
->saddr
, iph
->daddr
, iph
->protocol
);
291 /* In a BH context and ipfrag lock is held. -DaveM */
292 if((qp
->next
= ipq_hash
[hash
]) != NULL
)
293 qp
->next
->pprev
= &qp
->next
;
295 qp
->pprev
= &ipq_hash
[hash
];
300 frag_kfree_s(qp
, sizeof(struct ipq
));
302 NETDEBUG(printk(KERN_ERR
"IP: create: no memory left !\n"));
306 /* See if a fragment queue is complete. */
307 static int ip_done(struct ipq
*qp
)
312 /* Only possible if we received the final fragment. */
316 /* Check all fragment offsets to see if they connect. */
320 if (fp
->offset
> offset
)
321 return(0); /* fragment(s) missing */
326 /* All fragments are present. */
330 /* Build a new IP datagram from all its fragments.
332 * FIXME: We copy here because we lack an effective way of handling lists
333 * of bits on input. Until the new skb data handling is in I'm not going
334 * to touch this with a bargepole.
336 static struct sk_buff
*ip_glue(struct ipq
*qp
)
344 /* Allocate a new buffer for the datagram. */
345 len
= qp
->ihlen
+ qp
->len
;
350 skb
= dev_alloc_skb(len
);
354 /* Fill in the basic details. */
355 skb
->mac
.raw
= ptr
= skb
->data
;
356 skb
->nh
.iph
= iph
= (struct iphdr
*) skb_put(skb
, len
);
358 /* Copy the original IP headers into the new buffer. */
359 memcpy(ptr
, qp
->iph
, qp
->ihlen
);
362 /* Copy the data portions of all fragments into the new buffer. */
366 if ((fp
->len
<= 0) || ((count
+ fp
->len
) > skb
->len
))
368 memcpy((ptr
+ fp
->offset
), fp
->ptr
, fp
->len
);
369 if (count
== qp
->ihlen
) {
370 skb
->dst
= dst_clone(fp
->skb
->dst
);
371 skb
->dev
= fp
->skb
->dev
;
377 skb
->pkt_type
= qp
->fragments
->skb
->pkt_type
;
378 skb
->protocol
= qp
->fragments
->skb
->protocol
;
380 * Clearly bogus, because security markings of the individual
381 * fragments should have been checked for consistency before
382 * gluing, and intermediate coalescing of fragments may have
383 * taken place in ip_defrag() before ip_glue() ever got called.
384 * If we're not going to do the consistency checking, we might
385 * as well take the value associated with the first fragment.
388 skb
->security
= qp
->fragments
->skb
->security
;
390 #ifdef CONFIG_NETFILTER_DEBUG
391 skb
->nf_debug
= qp
->fragments
->skb
->nf_debug
;
394 /* Done with all fragments. Fixup the new IP header. */
397 iph
->tot_len
= htons(count
);
398 ip_statistics
.IpReasmOKs
++;
402 NETDEBUG(printk(KERN_ERR
403 "Invalid fragment list: Fragment over size.\n"));
407 NETDEBUG(printk(KERN_ERR
408 "IP: queue_glue: no memory for gluing queue %p\n",
414 "Oversized IP packet from %d.%d.%d.%d.\n",
415 NIPQUAD(qp
->iph
->saddr
));
417 ip_statistics
.IpReasmFails
++;
421 /* Process an incoming IP datagram fragment. */
422 struct sk_buff
*ip_defrag(struct sk_buff
*skb
)
424 struct iphdr
*iph
= skb
->nh
.iph
;
425 struct ipfrag
*prev
, *next
, *tmp
, *tfp
;
431 ip_statistics
.IpReasmReqds
++;
433 spin_lock(&ipfrag_lock
);
435 /* Start by cleaning up the memory. */
436 if (atomic_read(&ip_frag_mem
) > sysctl_ipfrag_high_thresh
)
440 * Look for the entry for this IP datagram in the
441 * "incomplete datagrams" queue. If found, the
444 qp
= ip_find(iph
, skb
->dst
);
446 /* Is this a non-fragmented datagram? */
447 offset
= ntohs(iph
->frag_off
);
448 flags
= offset
& ~IP_OFFSET
;
451 offset
<<= 3; /* offset is in 8-byte chunks */
455 * Check whether to create a fresh queue entry. If the
456 * queue already exists, its timer will be restarted as
457 * long as we continue to receive fragments.
460 /* ANK. If the first fragment is received,
461 * we should remember the correct IP header (with options)
464 /* Fragmented frame replaced by unfragmented copy? */
465 if ((flags
& IP_MF
) == 0)
468 memcpy(qp
->iph
, iph
, (ihl
+ 8));
471 /* Fragmented frame replaced by unfragmented copy? */
472 if ((offset
== 0) && ((flags
& IP_MF
) == 0))
475 /* If we failed to create it, then discard the frame. */
476 qp
= ip_create(skb
, iph
);
481 /* Attempt to construct an oversize packet. */
482 if((ntohs(iph
->tot_len
) + ((int) offset
)) > 65535)
485 /* Determine the position of this fragment. */
486 end
= offset
+ ntohs(iph
->tot_len
) - ihl
;
488 /* Is this the final fragment? */
489 if ((flags
& IP_MF
) == 0)
492 /* Find out which fragments are in front and at the back of us
493 * in the chain of fragments so far. We must know where to put
494 * this fragment, right?
497 for(next
= qp
->fragments
; next
!= NULL
; next
= next
->next
) {
498 if (next
->offset
>= offset
)
503 /* Point into the IP datagram 'data' part. */
504 ptr
= skb
->data
+ ihl
;
506 /* We found where to put this one. Check for overlap with
507 * preceding fragment, and, if needed, align things so that
508 * any overlaps are eliminated.
510 if ((prev
!= NULL
) && (offset
< prev
->end
)) {
511 i
= prev
->end
- offset
;
512 offset
+= i
; /* ptr into datagram */
513 ptr
+= i
; /* ptr into fragment data */
516 /* Look for overlap with succeeding segments.
517 * If we can merge fragments, do it.
519 for (tmp
= next
; tmp
!= NULL
; tmp
= tfp
) {
521 if (tmp
->offset
>= end
)
522 break; /* no overlaps at all */
524 i
= end
- next
->offset
; /* overlap is 'i' bytes */
525 tmp
->len
-= i
; /* so reduce size of */
526 tmp
->offset
+= i
; /* next fragment */
529 /* If we get a frag size of <= 0, remove it and the packet
533 if (tmp
->prev
!= NULL
)
534 tmp
->prev
->next
= tmp
->next
;
536 qp
->fragments
= tmp
->next
;
538 if (tmp
->next
!= NULL
)
539 tmp
->next
->prev
= tmp
->prev
;
541 /* We have killed the original next frame. */
544 frag_kfree_skb(tmp
->skb
);
545 frag_kfree_s(tmp
, sizeof(struct ipfrag
));
550 * Create a fragment to hold this skb.
551 * No memory to save the fragment? throw the lot ...
553 tfp
= ip_frag_create(offset
, end
, skb
, ptr
);
557 /* Insert this fragment in the chain of fragments. */
568 /* OK, so we inserted this new fragment into the chain.
569 * Check if we now have a full IP datagram which we can
570 * bump up to the IP layer...
573 /* Glue together the fragments. */
575 /* Free the queue entry. */
579 spin_unlock(&ipfrag_lock
);
584 * The queue is still active ... reset its timer.
587 mod_timer(&qp
->timer
, jiffies
+ sysctl_ipfrag_time
); /* ~ 30 seconds */
589 spin_unlock(&ipfrag_lock
);
593 * Error exits ... we need to reset the timer if there's a queue.
597 printk(KERN_INFO
"Oversized packet received from %d.%d.%d.%d\n",
598 NIPQUAD(iph
->saddr
));
599 /* the skb isn't in a fragment, so fall through to free it */
602 ip_statistics
.IpReasmFails
++;