aoe: improve retransmission heuristics
[linux-2.6.22.y-op.git] / drivers / block / aoe / aoecmd.c
blobc0bdc1fe21f052f0e89538cfe9f2a8d5955ac2ee
1 /* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
2 /*
3 * aoecmd.c
4 * Filesystem request handling methods
5 */
7 #include <linux/hdreg.h>
8 #include <linux/blkdev.h>
9 #include <linux/skbuff.h>
10 #include <linux/netdevice.h>
11 #include <linux/genhd.h>
12 #include <asm/unaligned.h>
13 #include "aoe.h"
/*
 * Retransmit timing, all in jiffies except MAXWAIT:
 *   TIMERTICK - period at which the retransmit timer re-arms itself
 *   MINTIMER  - lower clamp used when tracking the round-trip estimate
 *   MAXTIMER  - upper clamp for the round-trip estimate
 */
#define TIMERTICK (HZ / 10)
#define MINTIMER (TIMERTICK * 2)
#define MAXTIMER (2 * HZ)
/* After MAXWAIT seconds, give up and fail dev */
#define MAXWAIT (3 * 60)
20 struct sk_buff *
21 new_skb(ulong len)
23 struct sk_buff *skb;
25 skb = alloc_skb(len, GFP_ATOMIC);
26 if (skb) {
27 skb->nh.raw = skb->mac.raw = skb->data;
28 skb->protocol = __constant_htons(ETH_P_AOE);
29 skb->priority = 0;
30 skb_put(skb, len);
31 memset(skb->head, 0, len);
32 skb->next = skb->prev = NULL;
34 /* tell the network layer not to perform IP checksums
35 * or to get the NIC to do it
37 skb->ip_summed = CHECKSUM_NONE;
39 return skb;
42 static struct frame *
43 getframe(struct aoedev *d, int tag)
45 struct frame *f, *e;
47 f = d->frames;
48 e = f + d->nframes;
49 for (; f<e; f++)
50 if (f->tag == tag)
51 return f;
52 return NULL;
56 * Leave the top bit clear so we have tagspace for userland.
57 * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
58 * This driver reserves tag -1 to mean "unused frame."
60 static int
61 newtag(struct aoedev *d)
63 register ulong n;
65 n = jiffies & 0xffff;
66 return n |= (++d->lasttag & 0x7fff) << 16;
69 static int
70 aoehdr_atainit(struct aoedev *d, struct aoe_hdr *h)
72 u32 host_tag = newtag(d);
74 memcpy(h->src, d->ifp->dev_addr, sizeof h->src);
75 memcpy(h->dst, d->addr, sizeof h->dst);
76 h->type = __constant_cpu_to_be16(ETH_P_AOE);
77 h->verfl = AOE_HVER;
78 h->major = cpu_to_be16(d->aoemajor);
79 h->minor = d->aoeminor;
80 h->cmd = AOECMD_ATA;
81 h->tag = cpu_to_be32(host_tag);
83 return host_tag;
86 static inline void
87 put_lba(struct aoe_atahdr *ah, sector_t lba)
89 ah->lba0 = lba;
90 ah->lba1 = lba >>= 8;
91 ah->lba2 = lba >>= 8;
92 ah->lba3 = lba >>= 8;
93 ah->lba4 = lba >>= 8;
94 ah->lba5 = lba >>= 8;
97 static void
98 aoecmd_ata_rw(struct aoedev *d, struct frame *f)
100 struct aoe_hdr *h;
101 struct aoe_atahdr *ah;
102 struct buf *buf;
103 struct sk_buff *skb;
104 ulong bcnt;
105 register sector_t sector;
106 char writebit, extbit;
108 writebit = 0x10;
109 extbit = 0x4;
111 buf = d->inprocess;
113 sector = buf->sector;
114 bcnt = buf->bv_resid;
115 if (bcnt > d->maxbcnt)
116 bcnt = d->maxbcnt;
118 /* initialize the headers & frame */
119 skb = f->skb;
120 h = (struct aoe_hdr *) skb->mac.raw;
121 ah = (struct aoe_atahdr *) (h+1);
122 skb->len = sizeof *h + sizeof *ah;
123 memset(h, 0, skb->len);
124 f->tag = aoehdr_atainit(d, h);
125 f->waited = 0;
126 f->buf = buf;
127 f->bufaddr = buf->bufaddr;
128 f->bcnt = bcnt;
129 f->lba = sector;
131 /* set up ata header */
132 ah->scnt = bcnt >> 9;
133 put_lba(ah, sector);
134 if (d->flags & DEVFL_EXT) {
135 ah->aflags |= AOEAFL_EXT;
136 } else {
137 extbit = 0;
138 ah->lba3 &= 0x0f;
139 ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
142 if (bio_data_dir(buf->bio) == WRITE) {
143 skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
144 offset_in_page(f->bufaddr), bcnt);
145 ah->aflags |= AOEAFL_WRITE;
146 } else {
147 skb_shinfo(skb)->nr_frags = 0;
148 skb->len = ETH_ZLEN;
149 writebit = 0;
152 ah->cmdstat = WIN_READ | writebit | extbit;
154 /* mark all tracking fields and load out */
155 buf->nframesout += 1;
156 buf->bufaddr += bcnt;
157 buf->bv_resid -= bcnt;
158 /* dprintk("bv_resid=%ld\n", buf->bv_resid); */
159 buf->resid -= bcnt;
160 buf->sector += bcnt >> 9;
161 if (buf->resid == 0) {
162 d->inprocess = NULL;
163 } else if (buf->bv_resid == 0) {
164 buf->bv++;
165 buf->bv_resid = buf->bv->bv_len;
166 buf->bufaddr = page_address(buf->bv->bv_page) + buf->bv->bv_offset;
169 skb->dev = d->ifp;
170 skb_get(skb);
171 skb->next = NULL;
172 if (d->sendq_hd)
173 d->sendq_tl->next = skb;
174 else
175 d->sendq_hd = skb;
176 d->sendq_tl = skb;
179 /* some callers cannot sleep, and they can call this function,
180 * transmitting the packets later, when interrupts are on
182 static struct sk_buff *
183 aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff **tail)
185 struct aoe_hdr *h;
186 struct aoe_cfghdr *ch;
187 struct sk_buff *skb, *sl, *sl_tail;
188 struct net_device *ifp;
190 sl = sl_tail = NULL;
192 read_lock(&dev_base_lock);
193 for (ifp = dev_base; ifp; dev_put(ifp), ifp = ifp->next) {
194 dev_hold(ifp);
195 if (!is_aoe_netif(ifp))
196 continue;
198 skb = new_skb(sizeof *h + sizeof *ch);
199 if (skb == NULL) {
200 iprintk("skb alloc failure\n");
201 continue;
203 skb->dev = ifp;
204 if (sl_tail == NULL)
205 sl_tail = skb;
206 h = (struct aoe_hdr *) skb->mac.raw;
207 memset(h, 0, sizeof *h + sizeof *ch);
209 memset(h->dst, 0xff, sizeof h->dst);
210 memcpy(h->src, ifp->dev_addr, sizeof h->src);
211 h->type = __constant_cpu_to_be16(ETH_P_AOE);
212 h->verfl = AOE_HVER;
213 h->major = cpu_to_be16(aoemajor);
214 h->minor = aoeminor;
215 h->cmd = AOECMD_CFG;
217 skb->next = sl;
218 sl = skb;
220 read_unlock(&dev_base_lock);
222 if (tail != NULL)
223 *tail = sl_tail;
224 return sl;
227 /* enters with d->lock held */
228 void
229 aoecmd_work(struct aoedev *d)
231 struct frame *f;
232 struct buf *buf;
234 if (d->flags & DEVFL_PAUSE) {
235 if (!aoedev_isbusy(d))
236 d->sendq_hd = aoecmd_cfg_pkts(d->aoemajor,
237 d->aoeminor, &d->sendq_tl);
238 return;
241 loop:
242 f = getframe(d, FREETAG);
243 if (f == NULL)
244 return;
245 if (d->inprocess == NULL) {
246 if (list_empty(&d->bufq))
247 return;
248 buf = container_of(d->bufq.next, struct buf, bufs);
249 list_del(d->bufq.next);
250 /*dprintk("bi_size=%ld\n", buf->bio->bi_size); */
251 d->inprocess = buf;
253 aoecmd_ata_rw(d, f);
254 goto loop;
257 static void
258 rexmit(struct aoedev *d, struct frame *f)
260 struct sk_buff *skb;
261 struct aoe_hdr *h;
262 struct aoe_atahdr *ah;
263 char buf[128];
264 u32 n;
266 n = newtag(d);
268 snprintf(buf, sizeof buf,
269 "%15s e%ld.%ld oldtag=%08x@%08lx newtag=%08x\n",
270 "retransmit",
271 d->aoemajor, d->aoeminor, f->tag, jiffies, n);
272 aoechr_error(buf);
274 skb = f->skb;
275 h = (struct aoe_hdr *) skb->mac.raw;
276 ah = (struct aoe_atahdr *) (h+1);
277 f->tag = n;
278 h->tag = cpu_to_be32(n);
279 memcpy(h->dst, d->addr, sizeof h->dst);
280 memcpy(h->src, d->ifp->dev_addr, sizeof h->src);
282 n = DEFAULTBCNT / 512;
283 if (ah->scnt > n) {
284 ah->scnt = n;
285 if (ah->aflags & AOEAFL_WRITE)
286 skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
287 offset_in_page(f->bufaddr), DEFAULTBCNT);
288 if (++d->lostjumbo > (d->nframes << 1))
289 if (d->maxbcnt != DEFAULTBCNT) {
290 iprintk("too many lost jumbo - using 1KB frames.\n");
291 d->maxbcnt = DEFAULTBCNT;
292 d->flags |= DEVFL_MAXBCNT;
296 skb->dev = d->ifp;
297 skb_get(skb);
298 skb->next = NULL;
299 if (d->sendq_hd)
300 d->sendq_tl->next = skb;
301 else
302 d->sendq_hd = skb;
303 d->sendq_tl = skb;
306 static int
307 tsince(int tag)
309 int n;
311 n = jiffies & 0xffff;
312 n -= tag & 0xffff;
313 if (n < 0)
314 n += 1<<16;
315 return n;
318 static void
319 rexmit_timer(ulong vp)
321 struct aoedev *d;
322 struct frame *f, *e;
323 struct sk_buff *sl;
324 register long timeout;
325 ulong flags, n;
327 d = (struct aoedev *) vp;
328 sl = NULL;
330 /* timeout is always ~150% of the moving average */
331 timeout = d->rttavg;
332 timeout += timeout >> 1;
334 spin_lock_irqsave(&d->lock, flags);
336 if (d->flags & DEVFL_TKILL) {
337 spin_unlock_irqrestore(&d->lock, flags);
338 return;
340 f = d->frames;
341 e = f + d->nframes;
342 for (; f<e; f++) {
343 if (f->tag != FREETAG && tsince(f->tag) >= timeout) {
344 n = f->waited += timeout;
345 n /= HZ;
346 if (n > MAXWAIT) { /* waited too long. device failure. */
347 aoedev_downdev(d);
348 break;
350 rexmit(d, f);
354 sl = d->sendq_hd;
355 d->sendq_hd = d->sendq_tl = NULL;
356 if (sl) {
357 n = d->rttavg <<= 1;
358 if (n > MAXTIMER)
359 d->rttavg = MAXTIMER;
362 d->timer.expires = jiffies + TIMERTICK;
363 add_timer(&d->timer);
365 spin_unlock_irqrestore(&d->lock, flags);
367 aoenet_xmit(sl);
370 /* this function performs work that has been deferred until sleeping is OK
372 void
373 aoecmd_sleepwork(void *vp)
375 struct aoedev *d = (struct aoedev *) vp;
377 if (d->flags & DEVFL_GDALLOC)
378 aoeblk_gdalloc(d);
380 if (d->flags & DEVFL_NEWSIZE) {
381 struct block_device *bd;
382 unsigned long flags;
383 u64 ssize;
385 ssize = d->gd->capacity;
386 bd = bdget_disk(d->gd, 0);
388 if (bd) {
389 mutex_lock(&bd->bd_inode->i_mutex);
390 i_size_write(bd->bd_inode, (loff_t)ssize<<9);
391 mutex_unlock(&bd->bd_inode->i_mutex);
392 bdput(bd);
394 spin_lock_irqsave(&d->lock, flags);
395 d->flags |= DEVFL_UP;
396 d->flags &= ~DEVFL_NEWSIZE;
397 spin_unlock_irqrestore(&d->lock, flags);
401 static void
402 ataid_complete(struct aoedev *d, unsigned char *id)
404 u64 ssize;
405 u16 n;
407 /* word 83: command set supported */
408 n = le16_to_cpu(get_unaligned((__le16 *) &id[83<<1]));
410 /* word 86: command set/feature enabled */
411 n |= le16_to_cpu(get_unaligned((__le16 *) &id[86<<1]));
413 if (n & (1<<10)) { /* bit 10: LBA 48 */
414 d->flags |= DEVFL_EXT;
416 /* word 100: number lba48 sectors */
417 ssize = le64_to_cpu(get_unaligned((__le64 *) &id[100<<1]));
419 /* set as in ide-disk.c:init_idedisk_capacity */
420 d->geo.cylinders = ssize;
421 d->geo.cylinders /= (255 * 63);
422 d->geo.heads = 255;
423 d->geo.sectors = 63;
424 } else {
425 d->flags &= ~DEVFL_EXT;
427 /* number lba28 sectors */
428 ssize = le32_to_cpu(get_unaligned((__le32 *) &id[60<<1]));
430 /* NOTE: obsolete in ATA 6 */
431 d->geo.cylinders = le16_to_cpu(get_unaligned((__le16 *) &id[54<<1]));
432 d->geo.heads = le16_to_cpu(get_unaligned((__le16 *) &id[55<<1]));
433 d->geo.sectors = le16_to_cpu(get_unaligned((__le16 *) &id[56<<1]));
436 if (d->ssize != ssize)
437 iprintk("%012llx e%lu.%lu v%04x has %llu sectors\n",
438 (unsigned long long)mac_addr(d->addr),
439 d->aoemajor, d->aoeminor,
440 d->fw_ver, (long long)ssize);
441 d->ssize = ssize;
442 d->geo.start = 0;
443 if (d->gd != NULL) {
444 d->gd->capacity = ssize;
445 d->flags |= DEVFL_NEWSIZE;
446 } else {
447 if (d->flags & DEVFL_GDALLOC) {
448 eprintk("can't schedule work for e%lu.%lu, %s\n",
449 d->aoemajor, d->aoeminor,
450 "it's already on! This shouldn't happen.\n");
451 return;
453 d->flags |= DEVFL_GDALLOC;
455 schedule_work(&d->work);
458 static void
459 calc_rttavg(struct aoedev *d, int rtt)
461 register long n;
463 n = rtt;
464 if (n < 0) {
465 n = -rtt;
466 if (n < MINTIMER)
467 n = MINTIMER;
468 else if (n > MAXTIMER)
469 n = MAXTIMER;
470 d->mintimer += (n - d->mintimer) >> 1;
471 } else if (n < d->mintimer)
472 n = d->mintimer;
473 else if (n > MAXTIMER)
474 n = MAXTIMER;
476 /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
477 n -= d->rttavg;
478 d->rttavg += n >> 2;
481 void
482 aoecmd_ata_rsp(struct sk_buff *skb)
484 struct aoedev *d;
485 struct aoe_hdr *hin, *hout;
486 struct aoe_atahdr *ahin, *ahout;
487 struct frame *f;
488 struct buf *buf;
489 struct sk_buff *sl;
490 register long n;
491 ulong flags;
492 char ebuf[128];
493 u16 aoemajor;
495 hin = (struct aoe_hdr *) skb->mac.raw;
496 aoemajor = be16_to_cpu(hin->major);
497 d = aoedev_by_aoeaddr(aoemajor, hin->minor);
498 if (d == NULL) {
499 snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
500 "for unknown device %d.%d\n",
501 aoemajor, hin->minor);
502 aoechr_error(ebuf);
503 return;
506 spin_lock_irqsave(&d->lock, flags);
508 n = be32_to_cpu(hin->tag);
509 f = getframe(d, n);
510 if (f == NULL) {
511 calc_rttavg(d, -tsince(n));
512 spin_unlock_irqrestore(&d->lock, flags);
513 snprintf(ebuf, sizeof ebuf,
514 "%15s e%d.%d tag=%08x@%08lx\n",
515 "unexpected rsp",
516 be16_to_cpu(hin->major),
517 hin->minor,
518 be32_to_cpu(hin->tag),
519 jiffies);
520 aoechr_error(ebuf);
521 return;
524 calc_rttavg(d, tsince(f->tag));
526 ahin = (struct aoe_atahdr *) (hin+1);
527 hout = (struct aoe_hdr *) f->skb->mac.raw;
528 ahout = (struct aoe_atahdr *) (hout+1);
529 buf = f->buf;
531 if (ahout->cmdstat == WIN_IDENTIFY)
532 d->flags &= ~DEVFL_PAUSE;
533 if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
534 eprintk("ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%ld\n",
535 ahout->cmdstat, ahin->cmdstat,
536 d->aoemajor, d->aoeminor);
537 if (buf)
538 buf->flags |= BUFFL_FAIL;
539 } else {
540 n = ahout->scnt << 9;
541 switch (ahout->cmdstat) {
542 case WIN_READ:
543 case WIN_READ_EXT:
544 if (skb->len - sizeof *hin - sizeof *ahin < n) {
545 eprintk("runt data size in read. skb->len=%d\n",
546 skb->len);
547 /* fail frame f? just returning will rexmit. */
548 spin_unlock_irqrestore(&d->lock, flags);
549 return;
551 memcpy(f->bufaddr, ahin+1, n);
552 case WIN_WRITE:
553 case WIN_WRITE_EXT:
554 if (f->bcnt -= n) {
555 f->bufaddr += n;
556 put_lba(ahout, f->lba += ahout->scnt);
557 n = f->bcnt;
558 if (n > DEFAULTBCNT)
559 n = DEFAULTBCNT;
560 ahout->scnt = n >> 9;
561 if (ahout->aflags & AOEAFL_WRITE)
562 skb_fill_page_desc(f->skb, 0,
563 virt_to_page(f->bufaddr),
564 offset_in_page(f->bufaddr), n);
565 f->tag = newtag(d);
566 hout->tag = cpu_to_be32(f->tag);
567 skb->dev = d->ifp;
568 skb_get(f->skb);
569 f->skb->next = NULL;
570 spin_unlock_irqrestore(&d->lock, flags);
571 aoenet_xmit(f->skb);
572 return;
574 if (n > DEFAULTBCNT)
575 d->lostjumbo = 0;
576 break;
577 case WIN_IDENTIFY:
578 if (skb->len - sizeof *hin - sizeof *ahin < 512) {
579 iprintk("runt data size in ataid. skb->len=%d\n",
580 skb->len);
581 spin_unlock_irqrestore(&d->lock, flags);
582 return;
584 ataid_complete(d, (char *) (ahin+1));
585 break;
586 default:
587 iprintk("unrecognized ata command %2.2Xh for %d.%d\n",
588 ahout->cmdstat,
589 be16_to_cpu(hin->major),
590 hin->minor);
594 if (buf) {
595 buf->nframesout -= 1;
596 if (buf->nframesout == 0 && buf->resid == 0) {
597 unsigned long duration = jiffies - buf->start_time;
598 unsigned long n_sect = buf->bio->bi_size >> 9;
599 struct gendisk *disk = d->gd;
600 const int rw = bio_data_dir(buf->bio);
602 disk_stat_inc(disk, ios[rw]);
603 disk_stat_add(disk, ticks[rw], duration);
604 disk_stat_add(disk, sectors[rw], n_sect);
605 disk_stat_add(disk, io_ticks, duration);
606 n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
607 bio_endio(buf->bio, buf->bio->bi_size, n);
608 mempool_free(buf, d->bufpool);
612 f->buf = NULL;
613 f->tag = FREETAG;
615 aoecmd_work(d);
616 sl = d->sendq_hd;
617 d->sendq_hd = d->sendq_tl = NULL;
619 spin_unlock_irqrestore(&d->lock, flags);
620 aoenet_xmit(sl);
623 void
624 aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
626 struct sk_buff *sl;
628 sl = aoecmd_cfg_pkts(aoemajor, aoeminor, NULL);
630 aoenet_xmit(sl);
634 * Since we only call this in one place (and it only prepares one frame)
635 * we just return the skb. Usually we'd chain it up to the aoedev sendq.
637 static struct sk_buff *
638 aoecmd_ata_id(struct aoedev *d)
640 struct aoe_hdr *h;
641 struct aoe_atahdr *ah;
642 struct frame *f;
643 struct sk_buff *skb;
645 f = getframe(d, FREETAG);
646 if (f == NULL) {
647 eprintk("can't get a frame. This shouldn't happen.\n");
648 return NULL;
651 /* initialize the headers & frame */
652 skb = f->skb;
653 h = (struct aoe_hdr *) skb->mac.raw;
654 ah = (struct aoe_atahdr *) (h+1);
655 skb->len = sizeof *h + sizeof *ah;
656 memset(h, 0, skb->len);
657 f->tag = aoehdr_atainit(d, h);
658 f->waited = 0;
660 /* set up ata header */
661 ah->scnt = 1;
662 ah->cmdstat = WIN_IDENTIFY;
663 ah->lba3 = 0xa0;
665 skb->dev = d->ifp;
666 skb_get(skb);
668 d->rttavg = MAXTIMER;
669 d->timer.function = rexmit_timer;
671 return skb;
674 void
675 aoecmd_cfg_rsp(struct sk_buff *skb)
677 struct aoedev *d;
678 struct aoe_hdr *h;
679 struct aoe_cfghdr *ch;
680 ulong flags, sysminor, aoemajor;
681 struct sk_buff *sl;
682 enum { MAXFRAMES = 16 };
683 u16 n;
685 h = (struct aoe_hdr *) skb->mac.raw;
686 ch = (struct aoe_cfghdr *) (h+1);
689 * Enough people have their dip switches set backwards to
690 * warrant a loud message for this special case.
692 aoemajor = be16_to_cpu(h->major);
693 if (aoemajor == 0xfff) {
694 eprintk("Warning: shelf address is all ones. "
695 "Check shelf dip switches.\n");
696 return;
699 sysminor = SYSMINOR(aoemajor, h->minor);
700 if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
701 iprintk("e%ld.%d: minor number too large\n",
702 aoemajor, (int) h->minor);
703 return;
706 n = be16_to_cpu(ch->bufcnt);
707 if (n > MAXFRAMES) /* keep it reasonable */
708 n = MAXFRAMES;
710 d = aoedev_by_sysminor_m(sysminor, n);
711 if (d == NULL) {
712 iprintk("device sysminor_m failure\n");
713 return;
716 spin_lock_irqsave(&d->lock, flags);
718 /* permit device to migrate mac and network interface */
719 d->ifp = skb->dev;
720 memcpy(d->addr, h->src, sizeof d->addr);
721 if (!(d->flags & DEVFL_MAXBCNT)) {
722 n = d->ifp->mtu;
723 n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
724 n /= 512;
725 if (n > ch->scnt)
726 n = ch->scnt;
727 d->maxbcnt = n ? n * 512 : DEFAULTBCNT;
730 /* don't change users' perspective */
731 if (d->nopen && !(d->flags & DEVFL_PAUSE)) {
732 spin_unlock_irqrestore(&d->lock, flags);
733 return;
735 d->flags |= DEVFL_PAUSE; /* force pause */
736 d->mintimer = MINTIMER;
737 d->fw_ver = be16_to_cpu(ch->fwver);
739 /* check for already outstanding ataid */
740 sl = aoedev_isbusy(d) == 0 ? aoecmd_ata_id(d) : NULL;
742 spin_unlock_irqrestore(&d->lock, flags);
744 aoenet_xmit(sl);