/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Version:	$Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <Alan.Cox@linux.org>
 *
 *		Alan Cox	:	Split from ip.c, see ip_input.c for history.
 *		David S. Miller	:	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima		:	ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 */
23 #include <linux/types.h>
25 #include <linux/sched.h>
26 #include <linux/skbuff.h>
28 #include <linux/icmp.h>
29 #include <linux/netdevice.h>
33 #include <linux/tcp.h>
34 #include <linux/udp.h>
35 #include <linux/inet.h>
36 #include <linux/firewall.h>
37 #include <linux/ip_fw.h>
39 /* Fragment cache limits. We will commit 256K at one time. Should we
40 * cross that limit we will prune down to 192K. This should cope with
41 * even the most extreme cases without allowing an attacker to measurably
42 * harm machine performance.
44 int sysctl_ipfrag_high_thresh
= 256*1024;
45 int sysctl_ipfrag_low_thresh
= 192*1024;
47 int sysctl_ipfrag_time
= IP_FRAG_TIME
;
/*
 * Describe an IP fragment.
 *
 * One of these is allocated per received fragment (see ip_frag_create())
 * and linked into the owning queue's doubly linked fragment list.
 */
struct ipfrag {
	int		offset;		/* offset of fragment in IP datagram	*/
	int		end;		/* last byte of data in datagram	*/
	int		len;		/* length of this fragment		*/
	struct sk_buff	*skb;		/* complete received fragment		*/
	unsigned char	*ptr;		/* pointer into real fragment data	*/
	struct ipfrag	*next;		/* linked list pointers			*/
	struct ipfrag	*prev;		/* (previous fragment in the list)	*/
};
60 /* Describe an entry in the "incomplete datagrams" queue. */
62 struct iphdr
*iph
; /* pointer to IP header */
63 struct ipq
*next
; /* linked list pointers */
64 struct ipfrag
*fragments
; /* linked list of received fragments */
65 int len
; /* total length of original datagram */
66 short ihlen
; /* length of the IP header */
67 struct timer_list timer
; /* when will this queue expire? */
69 struct device
*dev
; /* Device - for icmp replies */
74 struct ipq
*ipq_hash
[IPQ_HASHSZ
];
/*
 * Map the (id, source, destination, protocol) tuple of a datagram to an
 * index into ipq_hash[].  The low bit of the id is shifted out before the
 * fields are XORed together, and the result is masked with IPQ_HASHSZ - 1.
 *
 * NOTE(review): masking only covers every bucket when IPQ_HASHSZ is a
 * power of two; its definition is not visible here -- confirm.
 */
#define ipqhashfn(id, saddr, daddr, prot) \
	((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
79 atomic_t ip_frag_mem
= ATOMIC_INIT(0); /* Memory used for fragments */
81 /* Memory Tracking Functions. */
82 extern __inline__
void frag_kfree_skb(struct sk_buff
*skb
)
84 atomic_sub(skb
->truesize
, &ip_frag_mem
);
88 extern __inline__
void frag_kfree_s(void *ptr
, int len
)
90 atomic_sub(len
, &ip_frag_mem
);
94 extern __inline__
void *frag_kmalloc(int size
, int pri
)
96 void *vp
= kmalloc(size
, pri
);
100 atomic_add(size
, &ip_frag_mem
);
104 /* Create a new fragment entry. */
105 static struct ipfrag
*ip_frag_create(int offset
, int end
,
106 struct sk_buff
*skb
, unsigned char *ptr
)
110 fp
= (struct ipfrag
*) frag_kmalloc(sizeof(struct ipfrag
), GFP_ATOMIC
);
114 /* Fill in the structure. */
117 fp
->len
= end
- offset
;
120 fp
->next
= fp
->prev
= NULL
;
122 /* Charge for the SKB as well. */
123 atomic_add(skb
->truesize
, &ip_frag_mem
);
128 NETDEBUG(printk(KERN_ERR
"IP: frag_create: no memory left !\n"));
132 /* Find the correct entry in the "incomplete datagrams" queue for
133 * this IP datagram, and return the queue entry address if found.
135 static inline struct ipq
*ip_find(struct iphdr
*iph
, struct dst_entry
*dst
)
138 __u32 saddr
= iph
->saddr
;
139 __u32 daddr
= iph
->daddr
;
140 __u8 protocol
= iph
->protocol
;
141 unsigned int hash
= ipqhashfn(id
, saddr
, daddr
, protocol
);
144 /* Always, we are in a BH context, so no locking. -DaveM */
145 for(qp
= ipq_hash
[hash
]; qp
; qp
= qp
->next
) {
146 if(qp
->iph
->id
== id
&&
147 qp
->iph
->saddr
== saddr
&&
148 qp
->iph
->daddr
== daddr
&&
149 qp
->iph
->protocol
== protocol
) {
150 del_timer(&qp
->timer
);
157 /* Remove an entry from the "incomplete datagrams" queue, either
158 * because we completed, reassembled and processed it, or because
161 * This is called _only_ from BH contexts, on packet reception
162 * processing and from frag queue expiration timers. -DaveM
164 static void ip_free(struct ipq
*qp
)
168 /* Stop the timer for this entry. */
169 del_timer(&qp
->timer
);
171 /* Remove this entry from the "incomplete datagrams" queue. */
173 qp
->next
->pprev
= qp
->pprev
;
174 *qp
->pprev
= qp
->next
;
176 /* Release all fragment data. */
179 struct ipfrag
*xp
= fp
->next
;
181 frag_kfree_skb(fp
->skb
);
182 frag_kfree_s(fp
, sizeof(struct ipfrag
));
186 /* Release the IP header. */
187 frag_kfree_s(qp
->iph
, 64 + 8);
189 /* Finally, release the queue descriptor itself. */
190 frag_kfree_s(qp
, sizeof(struct ipq
));
194 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
196 static void ip_expire(unsigned long arg
)
198 struct ipq
*qp
= (struct ipq
*) arg
;
202 #ifdef IP_EXPIRE_DEBUG
203 printk("warning: possible ip-expire attack\n");
208 /* Send an ICMP "Fragment Reassembly Timeout" message. */
209 ip_statistics
.IpReasmTimeout
++;
210 ip_statistics
.IpReasmFails
++;
211 icmp_send(qp
->fragments
->skb
, ICMP_TIME_EXCEEDED
, ICMP_EXC_FRAGTIME
, 0);
214 /* Nuke the fragment queue. */
218 /* Memory limiting on fragments. Evictor trashes the oldest
219 * fragment queue until we are back under the low threshold.
221 static void ip_evictor(void)
227 /* FIXME: Make LRU queue of frag heads. -DaveM */
228 for (i
= 0; i
< IPQ_HASHSZ
; i
++) {
230 if (atomic_read(&ip_frag_mem
) <= sysctl_ipfrag_low_thresh
)
232 /* We are in a BH context, so these queue
233 * accesses are safe. -DaveM
237 /* find the oldest queue for this hash bucket */
246 panic("ip_evictor: memcount");
249 /* Add an entry to the 'ipq' queue for a newly received IP datagram.
250 * We will (hopefully :-) receive all other fragments of this datagram
251 * in time, so we just create a queue for this datagram, in which we
252 * will insert the received fragments at their respective positions.
254 static struct ipq
*ip_create(struct sk_buff
*skb
, struct iphdr
*iph
)
260 qp
= (struct ipq
*) frag_kmalloc(sizeof(struct ipq
), GFP_ATOMIC
);
264 /* Allocate memory for the IP header (plus 8 octets for ICMP). */
265 ihlen
= iph
->ihl
* 4;
267 qp
->iph
= (struct iphdr
*) frag_kmalloc(64 + 8, GFP_ATOMIC
);
271 memcpy(qp
->iph
, iph
, ihlen
+ 8);
274 qp
->fragments
= NULL
;
277 /* Initialize a timer for this entry. */
278 init_timer(&qp
->timer
);
279 qp
->timer
.expires
= 0; /* (to be set later) */
280 qp
->timer
.data
= (unsigned long) qp
; /* pointer to queue */
281 qp
->timer
.function
= ip_expire
; /* expire function */
283 /* Add this entry to the queue. */
284 hash
= ipqhashfn(iph
->id
, iph
->saddr
, iph
->daddr
, iph
->protocol
);
286 /* We are in a BH context, no locking necessary. -DaveM */
287 if((qp
->next
= ipq_hash
[hash
]) != NULL
)
288 qp
->next
->pprev
= &qp
->next
;
290 qp
->pprev
= &ipq_hash
[hash
];
295 frag_kfree_s(qp
, sizeof(struct ipq
));
297 NETDEBUG(printk(KERN_ERR
"IP: create: no memory left !\n"));
301 /* See if a fragment queue is complete. */
302 static int ip_done(struct ipq
*qp
)
307 /* Only possible if we received the final fragment. */
311 /* Check all fragment offsets to see if they connect. */
315 if (fp
->offset
> offset
)
316 return(0); /* fragment(s) missing */
321 /* All fragments are present. */
325 /* Build a new IP datagram from all its fragments.
327 * FIXME: We copy here because we lack an effective way of handling lists
328 * of bits on input. Until the new skb data handling is in I'm not going
329 * to touch this with a bargepole.
331 static struct sk_buff
*ip_glue(struct ipq
*qp
)
339 /* Allocate a new buffer for the datagram. */
340 len
= qp
->ihlen
+ qp
->len
;
345 skb
= dev_alloc_skb(len
);
349 /* Fill in the basic details. */
350 skb
->mac
.raw
= ptr
= skb
->data
;
351 skb
->nh
.iph
= iph
= (struct iphdr
*) skb_put(skb
, len
);
353 /* Copy the original IP headers into the new buffer. */
354 memcpy(ptr
, qp
->iph
, qp
->ihlen
);
357 /* Copy the data portions of all fragments into the new buffer. */
361 if ((fp
->len
<= 0) || ((count
+ fp
->len
) > skb
->len
))
363 memcpy((ptr
+ fp
->offset
), fp
->ptr
, fp
->len
);
364 if (count
== qp
->ihlen
) {
365 skb
->dst
= dst_clone(fp
->skb
->dst
);
366 skb
->dev
= fp
->skb
->dev
;
372 skb
->pkt_type
= qp
->fragments
->skb
->pkt_type
;
373 skb
->protocol
= qp
->fragments
->skb
->protocol
;
375 * Clearly bogus, because security markings of the individual
376 * fragments should have been checked for consistency before
377 * gluing, and intermediate coalescing of fragments may have
378 * taken place in ip_defrag() before ip_glue() ever got called.
379 * If we're not going to do the consistency checking, we might
380 * as well take the value associated with the first fragment.
383 skb
->security
= qp
->fragments
->skb
->security
;
385 /* Done with all fragments. Fixup the new IP header. */
388 iph
->tot_len
= htons(count
);
389 ip_statistics
.IpReasmOKs
++;
393 NETDEBUG(printk(KERN_ERR
394 "Invalid fragment list: Fragment over size.\n"));
398 NETDEBUG(printk(KERN_ERR
399 "IP: queue_glue: no memory for gluing queue %p\n",
405 "Oversized IP packet from %d.%d.%d.%d.\n",
406 NIPQUAD(qp
->iph
->saddr
));
408 ip_statistics
.IpReasmFails
++;
412 /* Process an incoming IP datagram fragment. */
413 struct sk_buff
*ip_defrag(struct sk_buff
*skb
)
415 struct iphdr
*iph
= skb
->nh
.iph
;
416 struct ipfrag
*prev
, *next
, *tmp
, *tfp
;
422 ip_statistics
.IpReasmReqds
++;
424 /* Start by cleaning up the memory. */
425 if (atomic_read(&ip_frag_mem
) > sysctl_ipfrag_high_thresh
)
429 * Look for the entry for this IP datagram in the
430 * "incomplete datagrams" queue. If found, the
433 qp
= ip_find(iph
, skb
->dst
);
435 /* Is this a non-fragmented datagram? */
436 offset
= ntohs(iph
->frag_off
);
437 flags
= offset
& ~IP_OFFSET
;
440 offset
<<= 3; /* offset is in 8-byte chunks */
444 * Check whether to create a fresh queue entry. If the
445 * queue already exists, its timer will be restarted as
446 * long as we continue to receive fragments.
449 /* ANK. If the first fragment is received,
450 * we should remember the correct IP header (with options)
453 /* Fragmented frame replaced by unfragmented copy? */
454 if ((flags
& IP_MF
) == 0)
457 memcpy(qp
->iph
, iph
, (ihl
+ 8));
460 /* Fragmented frame replaced by unfragmented copy? */
461 if ((offset
== 0) && ((flags
& IP_MF
) == 0))
464 /* If we failed to create it, then discard the frame. */
465 qp
= ip_create(skb
, iph
);
470 /* Attempt to construct an oversize packet. */
471 if((ntohs(iph
->tot_len
) + ((int) offset
)) > 65535)
474 /* Determine the position of this fragment. */
475 end
= offset
+ ntohs(iph
->tot_len
) - ihl
;
477 /* Is this the final fragment? */
478 if ((flags
& IP_MF
) == 0)
481 /* Find out which fragments are in front and at the back of us
482 * in the chain of fragments so far. We must know where to put
483 * this fragment, right?
486 for(next
= qp
->fragments
; next
!= NULL
; next
= next
->next
) {
487 if (next
->offset
>= offset
)
492 /* Point into the IP datagram 'data' part. */
493 ptr
= skb
->data
+ ihl
;
495 /* We found where to put this one. Check for overlap with
496 * preceding fragment, and, if needed, align things so that
497 * any overlaps are eliminated.
499 if ((prev
!= NULL
) && (offset
< prev
->end
)) {
500 i
= prev
->end
- offset
;
501 offset
+= i
; /* ptr into datagram */
502 ptr
+= i
; /* ptr into fragment data */
505 /* Look for overlap with succeeding segments.
506 * If we can merge fragments, do it.
508 for (tmp
= next
; tmp
!= NULL
; tmp
= tfp
) {
510 if (tmp
->offset
>= end
)
511 break; /* no overlaps at all */
513 i
= end
- next
->offset
; /* overlap is 'i' bytes */
514 tmp
->len
-= i
; /* so reduce size of */
515 tmp
->offset
+= i
; /* next fragment */
518 /* If we get a frag size of <= 0, remove it and the packet
522 if (tmp
->prev
!= NULL
)
523 tmp
->prev
->next
= tmp
->next
;
525 qp
->fragments
= tmp
->next
;
527 if (tmp
->next
!= NULL
)
528 tmp
->next
->prev
= tmp
->prev
;
530 /* We have killed the original next frame. */
533 frag_kfree_skb(tmp
->skb
);
534 frag_kfree_s(tmp
, sizeof(struct ipfrag
));
539 * Create a fragment to hold this skb.
540 * No memory to save the fragment? throw the lot ...
542 tfp
= ip_frag_create(offset
, end
, skb
, ptr
);
546 /* Insert this fragment in the chain of fragments. */
557 /* OK, so we inserted this new fragment into the chain.
558 * Check if we now have a full IP datagram which we can
559 * bump up to the IP layer...
562 /* Glue together the fragments. */
564 /* Free the queue entry. */
572 * The queue is still active ... reset its timer.
575 mod_timer(&qp
->timer
, jiffies
+ sysctl_ipfrag_time
); /* ~ 30 seconds */
580 * Error exits ... we need to reset the timer if there's a queue.
584 printk(KERN_INFO
"Oversized packet received from %d.%d.%d.%d\n",
585 NIPQUAD(iph
->saddr
));
586 /* the skb isn't in a fragment, so fall through to free it */
589 ip_statistics
.IpReasmFails
++;