aoe: use high-resolution RTTs with fallback to low-res
linux-2.6.git: drivers/block/aoe/aoecmd.c
1 /* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
2 /*
3 * aoecmd.c
4 * Filesystem request handling methods
5 */
7 #include <linux/ata.h>
8 #include <linux/slab.h>
9 #include <linux/hdreg.h>
10 #include <linux/blkdev.h>
11 #include <linux/skbuff.h>
12 #include <linux/netdevice.h>
13 #include <linux/genhd.h>
14 #include <linux/moduleparam.h>
15 #include <linux/workqueue.h>
16 #include <linux/kthread.h>
17 #include <net/net_namespace.h>
18 #include <asm/unaligned.h>
19 #include <linux/uio.h>
20 #include "aoe.h"
22 #define MAXIOC (8192) /* default meant to avoid most soft lockups */
24 static void ktcomplete(struct frame *, struct sk_buff *);
26 static struct buf *nextbuf(struct aoedev *);
28 static int aoe_deadsecs = 60 * 3;
29 module_param(aoe_deadsecs, int, 0644);
30 MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
32 static int aoe_maxout = 16;
33 module_param(aoe_maxout, int, 0644);
34 MODULE_PARM_DESC(aoe_maxout,
35 "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
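/* Example usage (a sketch, not taken from this file): both knobs can be
 * set at load time, e.g. "modprobe aoe aoe_deadsecs=120 aoe_maxout=32",
 * or adjusted at runtime through /sys/module/aoe/parameters/, since the
 * parameters are registered with mode 0644.
 */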
37 static wait_queue_head_t ktiowq;
38 static struct ktstate kts;
40 /* io completion queue */
41 static struct {
42 struct list_head head;
43 spinlock_t lock;
44 } iocq;
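/* Completed responses are queued on iocq and handled by the "aoe_ktio"
 * kernel thread (kts, sleeping on ktiowq above), so completion work runs
 * in process context rather than in the receive path.
 */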
46 static struct sk_buff *
47 new_skb(ulong len)
49 struct sk_buff *skb;
51 skb = alloc_skb(len, GFP_ATOMIC);
52 if (skb) {
53 skb_reset_mac_header(skb);
54 skb_reset_network_header(skb);
55 skb->protocol = __constant_htons(ETH_P_AOE);
56 skb_checksum_none_assert(skb);
58 return skb;
61 static struct frame *
62 getframe_deferred(struct aoedev *d, u32 tag)
64 struct list_head *head, *pos, *nx;
65 struct frame *f;
67 head = &d->rexmitq;
68 list_for_each_safe(pos, nx, head) {
69 f = list_entry(pos, struct frame, head);
70 if (f->tag == tag) {
71 list_del(pos);
72 return f;
75 return NULL;
78 static struct frame *
79 getframe(struct aoedev *d, u32 tag)
81 struct frame *f;
82 struct list_head *head, *pos, *nx;
83 u32 n;
85 n = tag % NFACTIVE;
86 head = &d->factive[n];
87 list_for_each_safe(pos, nx, head) {
88 f = list_entry(pos, struct frame, head);
89 if (f->tag == tag) {
90 list_del(pos);
91 return f;
94 return NULL;
97 /*
98 * Leave the top bit clear so we have tagspace for userland.
99 * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
100 * This driver reserves tag -1 to mean "unused frame."
101 */
102 static int
103 newtag(struct aoedev *d)
105 register ulong n;
107 n = jiffies & 0xffff;
108 return n |= (++d->lasttag & 0x7fff) << 16;
111 static u32
112 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
114 u32 host_tag = newtag(d);
116 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
117 memcpy(h->dst, t->addr, sizeof h->dst);
118 h->type = __constant_cpu_to_be16(ETH_P_AOE);
119 h->verfl = AOE_HVER;
120 h->major = cpu_to_be16(d->aoemajor);
121 h->minor = d->aoeminor;
122 h->cmd = AOECMD_ATA;
123 h->tag = cpu_to_be32(host_tag);
125 return host_tag;
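/* put_lba stores the 48-bit LBA into the six single-byte lba fields of
 * the ATA header, least-significant byte first.
 */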
128 static inline void
129 put_lba(struct aoe_atahdr *ah, sector_t lba)
131 ah->lba0 = lba;
132 ah->lba1 = lba >>= 8;
133 ah->lba2 = lba >>= 8;
134 ah->lba3 = lba >>= 8;
135 ah->lba4 = lba >>= 8;
136 ah->lba5 = lba >>= 8;
139 static struct aoeif *
140 ifrotate(struct aoetgt *t)
142 struct aoeif *ifp;
144 ifp = t->ifp;
145 ifp++;
146 if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
147 ifp = t->ifs;
148 if (ifp->nd == NULL)
149 return NULL;
150 return t->ifp = ifp;
153 static void
154 skb_pool_put(struct aoedev *d, struct sk_buff *skb)
156 __skb_queue_tail(&d->skbpool, skb);
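/* Reuse a pooled skb only once the network layer has dropped its data
 * reference (dataref == 1); otherwise allocate a fresh skb, but only
 * while fewer than NSKBPOOLMAX skbs are pooled.
 */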
159 static struct sk_buff *
160 skb_pool_get(struct aoedev *d)
162 struct sk_buff *skb = skb_peek(&d->skbpool);
164 if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
165 __skb_unlink(skb, &d->skbpool);
166 return skb;
168 if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX &&
169 (skb = new_skb(ETH_ZLEN)))
170 return skb;
172 return NULL;
175 void
176 aoe_freetframe(struct frame *f)
178 struct aoetgt *t;
180 t = f->t;
181 f->buf = NULL;
182 f->bv = NULL;
183 f->r_skb = NULL;
184 list_add(&f->head, &t->ffree);
187 static struct frame *
188 newtframe(struct aoedev *d, struct aoetgt *t)
190 struct frame *f;
191 struct sk_buff *skb;
192 struct list_head *pos;
194 if (list_empty(&t->ffree)) {
195 if (t->falloc >= NSKBPOOLMAX*2)
196 return NULL;
197 f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
198 if (f == NULL)
199 return NULL;
200 t->falloc++;
201 f->t = t;
202 } else {
203 pos = t->ffree.next;
204 list_del(pos);
205 f = list_entry(pos, struct frame, head);
208 skb = f->skb;
209 if (skb == NULL) {
210 f->skb = skb = new_skb(ETH_ZLEN);
211 if (!skb) {
212 bail: aoe_freetframe(f);
213 return NULL;
217 if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
218 skb = skb_pool_get(d);
219 if (skb == NULL)
220 goto bail;
221 skb_pool_put(d, f->skb);
222 f->skb = skb;
225 skb->truesize -= skb->data_len;
226 skb_shinfo(skb)->nr_frags = skb->data_len = 0;
227 skb_trim(skb, 0);
228 return f;
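/* newframe round-robins over the targets, starting after the last one
 * used, looking for a target with window space.  If nothing at all is
 * outstanding, DEVFL_KICKME is set so the request queue gets kicked
 * from the retransmit timer.
 */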
231 static struct frame *
232 newframe(struct aoedev *d)
234 struct frame *f;
235 struct aoetgt *t, **tt;
236 int totout = 0;
238 if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */
239 printk(KERN_ERR "aoe: NULL TARGETS!\n");
240 return NULL;
242 tt = d->tgt; /* last used target */
243 for (;;) {
244 tt++;
245 if (tt >= &d->targets[NTARGETS] || !*tt)
246 tt = d->targets;
247 t = *tt;
248 totout += t->nout;
249 if (t->nout < t->maxout
250 && t != d->htgt
251 && t->ifp->nd) {
252 f = newtframe(d, t);
253 if (f) {
254 ifrotate(t);
255 d->tgt = tt;
256 return f;
259 if (tt == d->tgt) /* we've looped and found nada */
260 break;
262 if (totout == 0) {
263 d->kicked++;
264 d->flags |= DEVFL_KICKME;
266 return NULL;
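/* skb_fillup maps cnt bytes of the bio_vec array, beginning at page
 * offset off, into the skb as page fragments without copying.
 */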
269 static void
270 skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
272 int frag = 0;
273 ulong fcnt;
274 loop:
275 fcnt = bv->bv_len - (off - bv->bv_offset);
276 if (fcnt > cnt)
277 fcnt = cnt;
278 skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
279 cnt -= fcnt;
280 if (cnt <= 0)
281 return;
282 bv++;
283 off = bv->bv_offset;
284 goto loop;
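/* Hash an active frame into the bucket selected by its tag. */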
287 static void
288 fhash(struct frame *f)
290 struct aoedev *d = f->t->d;
291 u32 n;
293 n = f->tag % NFACTIVE;
294 list_add_tail(&f->head, &d->factive[n]);
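/* Build and send one ATA read/write frame for the next pending buf.
 * Returns nonzero when a frame was sent, so aoecmd_work can keep
 * calling until the bufs or the frames run out.
 */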
297 static int
298 aoecmd_ata_rw(struct aoedev *d)
300 struct frame *f;
301 struct aoe_hdr *h;
302 struct aoe_atahdr *ah;
303 struct buf *buf;
304 struct aoetgt *t;
305 struct sk_buff *skb;
306 struct sk_buff_head queue;
307 ulong bcnt, fbcnt;
308 char writebit, extbit;
310 writebit = 0x10;
311 extbit = 0x4;
313 buf = nextbuf(d);
314 if (buf == NULL)
315 return 0;
316 f = newframe(d);
317 if (f == NULL)
318 return 0;
319 t = *d->tgt;
320 bcnt = d->maxbcnt;
321 if (bcnt == 0)
322 bcnt = DEFAULTBCNT;
323 if (bcnt > buf->resid)
324 bcnt = buf->resid;
325 fbcnt = bcnt;
326 f->bv = buf->bv;
327 f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
328 do {
329 if (fbcnt < buf->bv_resid) {
330 buf->bv_resid -= fbcnt;
331 buf->resid -= fbcnt;
332 break;
334 fbcnt -= buf->bv_resid;
335 buf->resid -= buf->bv_resid;
336 if (buf->resid == 0) {
337 d->ip.buf = NULL;
338 break;
340 buf->bv++;
341 buf->bv_resid = buf->bv->bv_len;
342 WARN_ON(buf->bv_resid == 0);
343 } while (fbcnt);
345 /* initialize the headers & frame */
346 skb = f->skb;
347 h = (struct aoe_hdr *) skb_mac_header(skb);
348 ah = (struct aoe_atahdr *) (h+1);
349 skb_put(skb, sizeof *h + sizeof *ah);
350 memset(h, 0, skb->len);
351 f->tag = aoehdr_atainit(d, t, h);
352 fhash(f);
353 t->nout++;
354 f->waited = 0;
355 f->buf = buf;
356 f->bcnt = bcnt;
357 f->lba = buf->sector;
359 /* set up ata header */
360 ah->scnt = bcnt >> 9;
361 put_lba(ah, buf->sector);
362 if (d->flags & DEVFL_EXT) {
363 ah->aflags |= AOEAFL_EXT;
364 } else {
365 extbit = 0;
366 ah->lba3 &= 0x0f;
367 ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
369 if (bio_data_dir(buf->bio) == WRITE) {
370 skb_fillup(skb, f->bv, f->bv_off, bcnt);
371 ah->aflags |= AOEAFL_WRITE;
372 skb->len += bcnt;
373 skb->data_len = bcnt;
374 skb->truesize += bcnt;
375 t->wpkts++;
376 } else {
377 t->rpkts++;
378 writebit = 0;
381 ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
383 /* mark all tracking fields and load out */
384 buf->nframesout += 1;
385 buf->sector += bcnt >> 9;
387 skb->dev = t->ifp->nd;
388 skb = skb_clone(skb, GFP_ATOMIC);
389 if (skb) {
390 do_gettimeofday(&f->sent);
391 f->sent_jiffs = (u32) jiffies;
392 __skb_queue_head_init(&queue);
393 __skb_queue_tail(&queue, skb);
394 aoenet_xmit(&queue);
396 return 1;
399 /* Some callers cannot sleep; they can call this function and
400 * transmit the packets later, when interrupts are on.
401 */
402 static void
403 aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue)
405 struct aoe_hdr *h;
406 struct aoe_cfghdr *ch;
407 struct sk_buff *skb;
408 struct net_device *ifp;
410 rcu_read_lock();
411 for_each_netdev_rcu(&init_net, ifp) {
412 dev_hold(ifp);
413 if (!is_aoe_netif(ifp))
414 goto cont;
416 skb = new_skb(sizeof *h + sizeof *ch);
417 if (skb == NULL) {
418 printk(KERN_INFO "aoe: skb alloc failure\n");
419 goto cont;
421 skb_put(skb, sizeof *h + sizeof *ch);
422 skb->dev = ifp;
423 __skb_queue_tail(queue, skb);
424 h = (struct aoe_hdr *) skb_mac_header(skb);
425 memset(h, 0, sizeof *h + sizeof *ch);
427 memset(h->dst, 0xff, sizeof h->dst);
428 memcpy(h->src, ifp->dev_addr, sizeof h->src);
429 h->type = __constant_cpu_to_be16(ETH_P_AOE);
430 h->verfl = AOE_HVER;
431 h->major = cpu_to_be16(aoemajor);
432 h->minor = aoeminor;
433 h->cmd = AOECMD_CFG;
435 cont:
436 dev_put(ifp);
438 rcu_read_unlock();
441 static void
442 resend(struct aoedev *d, struct frame *f)
444 struct sk_buff *skb;
445 struct sk_buff_head queue;
446 struct aoe_hdr *h;
447 struct aoe_atahdr *ah;
448 struct aoetgt *t;
449 char buf[128];
450 u32 n;
452 t = f->t;
453 n = newtag(d);
454 skb = f->skb;
455 if (ifrotate(t) == NULL) {
456 /* probably can't happen, but set it up to fail anyway */
457 pr_info("aoe: resend: no interfaces to rotate to.\n");
458 ktcomplete(f, NULL);
459 return;
461 h = (struct aoe_hdr *) skb_mac_header(skb);
462 ah = (struct aoe_atahdr *) (h+1);
464 snprintf(buf, sizeof buf,
465 "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
466 "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
467 h->src, h->dst, t->nout);
468 aoechr_error(buf);
470 f->tag = n;
471 fhash(f);
472 h->tag = cpu_to_be32(n);
473 memcpy(h->dst, t->addr, sizeof h->dst);
474 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
476 skb->dev = t->ifp->nd;
477 skb = skb_clone(skb, GFP_ATOMIC);
478 if (skb == NULL)
479 return;
480 do_gettimeofday(&f->sent);
481 f->sent_jiffs = (u32) jiffies;
482 __skb_queue_head_init(&queue);
483 __skb_queue_tail(&queue, skb);
484 aoenet_xmit(&queue);
487 static int
488 tsince_hr(struct frame *f)
490 struct timeval now;
491 int n;
493 do_gettimeofday(&now);
494 n = now.tv_usec - f->sent.tv_usec;
495 n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
497 if (n < 0)
498 n = -n;
500 /* For relatively long periods, use jiffies to avoid
501 * discrepancies caused by updates to the system time.
503 * On a system with HZ of 1000, 32 bits is over 49 days
504 * worth of jiffies, or over 71 minutes worth of usecs.
506 * Jiffies overflow is handled by subtraction of unsigned ints:
507 * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
508 * $3 = 4
509 * (gdb)
510 */
511 if (n > USEC_PER_SEC / 4) {
512 n = ((u32) jiffies) - f->sent_jiffs;
513 n *= USEC_PER_SEC / HZ;
516 return n;
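/* tsince is the low-resolution fallback: it recovers the transmit tick
 * from the low 16 bits of the tag (see newtag) and converts the elapsed
 * ticks to microseconds.
 */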
519 static int
520 tsince(u32 tag)
522 int n;
524 n = jiffies & 0xffff;
525 n -= tag & 0xffff;
526 if (n < 0)
527 n += 1<<16;
528 return jiffies_to_usecs(n + 1);
531 static struct aoeif *
532 getif(struct aoetgt *t, struct net_device *nd)
534 struct aoeif *p, *e;
536 p = t->ifs;
537 e = p + NAOEIFS;
538 for (; p < e; p++)
539 if (p->nd == nd)
540 return p;
541 return NULL;
544 static void
545 ejectif(struct aoetgt *t, struct aoeif *ifp)
547 struct aoeif *e;
548 struct net_device *nd;
549 ulong n;
551 nd = ifp->nd;
552 e = t->ifs + NAOEIFS - 1;
553 n = (e - ifp) * sizeof *ifp;
554 memmove(ifp, ifp+1, n);
555 e->nd = NULL;
556 dev_put(nd);
559 static int
560 sthtith(struct aoedev *d)
562 struct frame *f, *nf;
563 struct list_head *nx, *pos, *head;
564 struct sk_buff *skb;
565 struct aoetgt *ht = d->htgt;
566 int i;
568 for (i = 0; i < NFACTIVE; i++) {
569 head = &d->factive[i];
570 list_for_each_safe(pos, nx, head) {
571 f = list_entry(pos, struct frame, head);
572 if (f->t != ht)
573 continue;
575 nf = newframe(d);
576 if (!nf)
577 return 0;
579 /* remove frame from active list */
580 list_del(pos);
582 /* reassign all pertinent bits to new outbound frame */
583 skb = nf->skb;
584 nf->skb = f->skb;
585 nf->buf = f->buf;
586 nf->bcnt = f->bcnt;
587 nf->lba = f->lba;
588 nf->bv = f->bv;
589 nf->bv_off = f->bv_off;
590 nf->waited = 0;
591 nf->sent_jiffs = f->sent_jiffs;
592 f->skb = skb;
593 aoe_freetframe(f);
594 ht->nout--;
595 nf->t->nout++;
596 resend(d, nf);
599 /* We've cleaned up this target's outstanding frames, so take away its
600 * interfaces so it won't be used. We should remove it from
601 * the target array here, but cleaning up a target is
602 * involved. PUNT!
603 */
604 memset(ht->ifs, 0, sizeof ht->ifs);
605 d->htgt = NULL;
606 return 1;
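/* Resend frames parked on d->rexmitq as their targets regain window
 * space (t->nout < t->maxout).
 */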
609 static void
610 rexmit_deferred(struct aoedev *d)
612 struct aoetgt *t;
613 struct frame *f;
614 struct list_head *pos, *nx, *head;
616 head = &d->rexmitq;
617 list_for_each_safe(pos, nx, head) {
618 f = list_entry(pos, struct frame, head);
619 t = f->t;
620 if (t->nout >= t->maxout)
621 continue;
622 list_del(pos);
623 t->nout++;
624 resend(d, f);
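/* rexmit_timer runs once per TIMERTICK.  It derives a timeout from the
 * smoothed RTT and its deviation, moves expired frames to the rexmit
 * queue, collapses the affected target's window to one frame, and fails
 * the device once a frame has waited longer than aoe_deadsecs.
 */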
628 static void
629 rexmit_timer(ulong vp)
631 struct aoedev *d;
632 struct aoetgt *t;
633 struct aoeif *ifp;
634 struct frame *f;
635 struct list_head *head, *pos, *nx;
636 LIST_HEAD(flist);
637 register long timeout;
638 ulong flags, n;
639 int i;
641 d = (struct aoedev *) vp;
643 spin_lock_irqsave(&d->lock, flags);
645 /* timeout based on observed timings and variations */
646 timeout = 2 * d->rttavg >> RTTSCALE;
647 timeout += 8 * d->rttdev >> RTTDSCALE;
648 if (timeout == 0)
649 timeout = 1;
651 if (d->flags & DEVFL_TKILL) {
652 spin_unlock_irqrestore(&d->lock, flags);
653 return;
656 /* collect all frames to rexmit into flist */
657 for (i = 0; i < NFACTIVE; i++) {
658 head = &d->factive[i];
659 list_for_each_safe(pos, nx, head) {
660 f = list_entry(pos, struct frame, head);
661 if (tsince_hr(f) < timeout)
662 break; /* end of expired frames */
663 /* move to flist for later processing */
664 list_move_tail(pos, &flist);
668 /* process expired frames */
669 while (!list_empty(&flist)) {
670 pos = flist.next;
671 f = list_entry(pos, struct frame, head);
672 n = f->waited += tsince_hr(f);
673 n /= USEC_PER_SEC;
674 if (n > aoe_deadsecs) {
675 /* Waited too long. Device failure.
676 * Hang all frames on first hash bucket for downdev
677 * to clean up.
678 */
679 list_splice(&flist, &d->factive[0]);
680 aoedev_downdev(d);
681 goto out;
684 t = f->t;
685 if (n > aoe_deadsecs/2)
686 d->htgt = t; /* see if another target can help */
688 if (t->maxout != 1) {
689 t->ssthresh = t->maxout / 2;
690 t->maxout = 1;
693 ifp = getif(t, f->skb->dev);
694 if (ifp && ++ifp->lost > (t->nframes << 1)
695 && (ifp != t->ifs || t->ifs[1].nd)) {
696 ejectif(t, ifp);
697 ifp = NULL;
699 list_move_tail(pos, &d->rexmitq);
700 t->nout--;
702 rexmit_deferred(d);
704 out:
705 if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
706 d->flags &= ~DEVFL_KICKME;
707 d->blkq->request_fn(d->blkq);
710 d->timer.expires = jiffies + TIMERTICK;
711 add_timer(&d->timer);
713 spin_unlock_irqrestore(&d->lock, flags);
716 static unsigned long
717 rqbiocnt(struct request *r)
719 struct bio *bio;
720 unsigned long n = 0;
722 __rq_for_each_bio(bio, r)
723 n++;
724 return n;
727 /* This can be removed if we are certain that no users of the block
728 * layer will ever use zero-count pages in bios. Otherwise we have to
729 * protect against the put_page sometimes done by the network layer.
731 * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
732 * discussion.
734 * We cannot use get_page in the workaround, because it insists on a
735 * positive page count as a precondition. So we use _count directly.
736 */
737 static void
738 bio_pageinc(struct bio *bio)
740 struct bio_vec *bv;
741 struct page *page;
742 int i;
744 bio_for_each_segment(bv, bio, i) {
745 page = bv->bv_page;
746 /* Non-zero page count for non-head members of
747 * compound pages is no longer allowed by the kernel,
748 * but this has never been seen here.
749 */
750 if (unlikely(PageCompound(page)))
751 if (compound_trans_head(page) != page) {
752 pr_crit("page tail used for block I/O\n");
753 BUG();
755 atomic_inc(&page->_count);
759 static void
760 bio_pagedec(struct bio *bio)
762 struct bio_vec *bv;
763 int i;
765 bio_for_each_segment(bv, bio, i)
766 atomic_dec(&bv->bv_page->_count);
769 static void
770 bufinit(struct buf *buf, struct request *rq, struct bio *bio)
772 struct bio_vec *bv;
774 memset(buf, 0, sizeof(*buf));
775 buf->rq = rq;
776 buf->bio = bio;
777 buf->resid = bio->bi_size;
778 buf->sector = bio->bi_sector;
779 bio_pageinc(bio);
780 buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
781 buf->bv_resid = bv->bv_len;
782 WARN_ON(buf->bv_resid == 0);
785 static struct buf *
786 nextbuf(struct aoedev *d)
788 struct request *rq;
789 struct request_queue *q;
790 struct buf *buf;
791 struct bio *bio;
793 q = d->blkq;
794 if (q == NULL)
795 return NULL; /* initializing */
796 if (d->ip.buf)
797 return d->ip.buf;
798 rq = d->ip.rq;
799 if (rq == NULL) {
800 rq = blk_peek_request(q);
801 if (rq == NULL)
802 return NULL;
803 blk_start_request(rq);
804 d->ip.rq = rq;
805 d->ip.nxbio = rq->bio;
806 rq->special = (void *) rqbiocnt(rq);
808 buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
809 if (buf == NULL) {
810 pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
811 return NULL;
813 bio = d->ip.nxbio;
814 bufinit(buf, rq, bio);
815 bio = bio->bi_next;
816 d->ip.nxbio = bio;
817 if (bio == NULL)
818 d->ip.rq = NULL;
819 return d->ip.buf = buf;
822 /* enters with d->lock held */
823 void
824 aoecmd_work(struct aoedev *d)
826 if (d->htgt && !sthtith(d))
827 return;
828 rexmit_deferred(d);
829 while (aoecmd_ata_rw(d))
830 ;
833 /* this function performs work that has been deferred until sleeping is OK
834 */
835 void
836 aoecmd_sleepwork(struct work_struct *work)
838 struct aoedev *d = container_of(work, struct aoedev, work);
839 struct block_device *bd;
840 u64 ssize;
842 if (d->flags & DEVFL_GDALLOC)
843 aoeblk_gdalloc(d);
845 if (d->flags & DEVFL_NEWSIZE) {
846 ssize = get_capacity(d->gd);
847 bd = bdget_disk(d->gd, 0);
848 if (bd) {
849 mutex_lock(&bd->bd_inode->i_mutex);
850 i_size_write(bd->bd_inode, (loff_t)ssize<<9);
851 mutex_unlock(&bd->bd_inode->i_mutex);
852 bdput(bd);
854 spin_lock_irq(&d->lock);
855 d->flags |= DEVFL_UP;
856 d->flags &= ~DEVFL_NEWSIZE;
857 spin_unlock_irq(&d->lock);
861 static void
862 ata_ident_fixstring(u16 *id, int ns)
864 u16 s;
866 while (ns-- > 0) {
867 s = *id;
868 *id++ = s >> 8 | s << 8;
872 static void
873 ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
875 u64 ssize;
876 u16 n;
878 /* word 83: command set supported */
879 n = get_unaligned_le16(&id[83 << 1]);
881 /* word 86: command set/feature enabled */
882 n |= get_unaligned_le16(&id[86 << 1]);
884 if (n & (1<<10)) { /* bit 10: LBA 48 */
885 d->flags |= DEVFL_EXT;
887 /* word 100: number lba48 sectors */
888 ssize = get_unaligned_le64(&id[100 << 1]);
890 /* set as in ide-disk.c:init_idedisk_capacity */
891 d->geo.cylinders = ssize;
892 d->geo.cylinders /= (255 * 63);
893 d->geo.heads = 255;
894 d->geo.sectors = 63;
895 } else {
896 d->flags &= ~DEVFL_EXT;
898 /* number lba28 sectors */
899 ssize = get_unaligned_le32(&id[60 << 1]);
901 /* NOTE: obsolete in ATA 6 */
902 d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
903 d->geo.heads = get_unaligned_le16(&id[55 << 1]);
904 d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
907 ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */
908 ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */
909 ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */
910 memcpy(d->ident, id, sizeof(d->ident));
912 if (d->ssize != ssize)
913 printk(KERN_INFO
914 "aoe: %pm e%ld.%d v%04x has %llu sectors\n",
915 t->addr,
916 d->aoemajor, d->aoeminor,
917 d->fw_ver, (long long)ssize);
918 d->ssize = ssize;
919 d->geo.start = 0;
920 if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
921 return;
922 if (d->gd != NULL) {
923 set_capacity(d->gd, ssize);
924 d->flags |= DEVFL_NEWSIZE;
925 } else
926 d->flags |= DEVFL_GDALLOC;
927 schedule_work(&d->work);
930 static void
931 calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
933 register long n;
935 n = rtt;
937 /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */
938 n -= d->rttavg >> RTTSCALE;
939 d->rttavg += n;
940 if (n < 0)
941 n = -n;
942 n -= d->rttdev >> RTTDSCALE;
943 d->rttdev += n;
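/* Below: open the window by one per RTT sample while under ssthresh,
 * then by roughly one per full window in flight (paced by next_cwnd).
 */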
945 if (!t || t->maxout >= t->nframes)
946 return;
947 if (t->maxout < t->ssthresh)
948 t->maxout += 1;
949 else if (t->nout == t->maxout && t->next_cwnd-- == 0) {
950 t->maxout += 1;
951 t->next_cwnd = t->maxout;
955 static struct aoetgt *
956 gettgt(struct aoedev *d, char *addr)
958 struct aoetgt **t, **e;
960 t = d->targets;
961 e = t + NTARGETS;
962 for (; t < e && *t; t++)
963 if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
964 return *t;
965 return NULL;
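/* Copy cnt bytes of skb payload into the bio_vec pages, starting at
 * page offset off.
 */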
968 static void
969 bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
971 ulong fcnt;
972 char *p;
973 int soff = 0;
974 loop:
975 fcnt = bv->bv_len - (off - bv->bv_offset);
976 if (fcnt > cnt)
977 fcnt = cnt;
978 p = page_address(bv->bv_page) + off;
979 skb_copy_bits(skb, soff, p, fcnt);
980 soff += fcnt;
981 cnt -= fcnt;
982 if (cnt <= 0)
983 return;
984 bv++;
985 off = bv->bv_offset;
986 goto loop;
989 void
990 aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
992 struct bio *bio;
993 int bok;
994 struct request_queue *q;
996 q = d->blkq;
997 if (rq == d->ip.rq)
998 d->ip.rq = NULL;
999 do {
1000 bio = rq->bio;
1001 bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
1002 } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size));
1004 /* cf. http://lkml.org/lkml/2006/10/31/28 */
1005 if (!fastfail)
1006 __blk_run_queue(q);
1009 static void
1010 aoe_end_buf(struct aoedev *d, struct buf *buf)
1012 struct request *rq;
1013 unsigned long n;
1015 if (buf == d->ip.buf)
1016 d->ip.buf = NULL;
1017 rq = buf->rq;
1018 bio_pagedec(buf->bio);
1019 mempool_free(buf, d->bufpool);
1020 n = (unsigned long) rq->special;
1021 rq->special = (void *) --n;
1022 if (n == 0)
1023 aoe_end_request(d, rq, 0);
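/* ktiocomplete runs in the aoe_ktio thread: it checks the ATA status,
 * copies read data into the bio pages, completes the buf once all of
 * its frames are accounted for, and restarts I/O on the device.
 */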
1026 static void
1027 ktiocomplete(struct frame *f)
1029 struct aoe_hdr *hin, *hout;
1030 struct aoe_atahdr *ahin, *ahout;
1031 struct buf *buf;
1032 struct sk_buff *skb;
1033 struct aoetgt *t;
1034 struct aoeif *ifp;
1035 struct aoedev *d;
1036 long n;
1038 if (f == NULL)
1039 return;
1041 t = f->t;
1042 d = t->d;
1044 hout = (struct aoe_hdr *) skb_mac_header(f->skb);
1045 ahout = (struct aoe_atahdr *) (hout+1);
1046 buf = f->buf;
1047 skb = f->r_skb;
1048 if (skb == NULL)
1049 goto noskb; /* just fail the buf. */
1051 hin = (struct aoe_hdr *) skb->data;
1052 skb_pull(skb, sizeof(*hin));
1053 ahin = (struct aoe_atahdr *) skb->data;
1054 skb_pull(skb, sizeof(*ahin));
1055 if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
1056 pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
1057 ahout->cmdstat, ahin->cmdstat,
1058 d->aoemajor, d->aoeminor);
1059 noskb: if (buf)
1060 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1061 goto badrsp;
1064 n = ahout->scnt << 9;
1065 switch (ahout->cmdstat) {
1066 case ATA_CMD_PIO_READ:
1067 case ATA_CMD_PIO_READ_EXT:
1068 if (skb->len < n) {
1069 pr_err("aoe: runt data size in read. skb->len=%d need=%ld\n",
1070 skb->len, n);
1071 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1072 break;
1074 bvcpy(f->bv, f->bv_off, skb, n);
1075 case ATA_CMD_PIO_WRITE:
1076 case ATA_CMD_PIO_WRITE_EXT:
1077 spin_lock_irq(&d->lock);
1078 ifp = getif(t, skb->dev);
1079 if (ifp)
1080 ifp->lost = 0;
1081 if (d->htgt == t) /* I'll help myself, thank you. */
1082 d->htgt = NULL;
1083 spin_unlock_irq(&d->lock);
1084 break;
1085 case ATA_CMD_ID_ATA:
1086 if (skb->len < 512) {
1087 pr_info("aoe: runt data size in ataid. skb->len=%d\n",
1088 skb->len);
1089 break;
1091 if (skb_linearize(skb))
1092 break;
1093 spin_lock_irq(&d->lock);
1094 ataid_complete(d, t, skb->data);
1095 spin_unlock_irq(&d->lock);
1096 break;
1097 default:
1098 pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
1099 ahout->cmdstat,
1100 be16_to_cpu(get_unaligned(&hin->major)),
1101 hin->minor);
1103 badrsp:
1104 spin_lock_irq(&d->lock);
1106 aoe_freetframe(f);
1108 if (buf && --buf->nframesout == 0 && buf->resid == 0)
1109 aoe_end_buf(d, buf);
1111 aoecmd_work(d);
1113 spin_unlock_irq(&d->lock);
1114 aoedev_put(d);
1115 dev_kfree_skb(skb);
1118 /* Enters with iocq.lock held.
1119 * Returns true iff responses needing processing remain.
1120 */
1121 static int
1122 ktio(void)
1124 struct frame *f;
1125 struct list_head *pos;
1126 int i;
1128 for (i = 0; ; ++i) {
1129 if (i == MAXIOC)
1130 return 1;
1131 if (list_empty(&iocq.head))
1132 return 0;
1133 pos = iocq.head.next;
1134 list_del(pos);
1135 spin_unlock_irq(&iocq.lock);
1136 f = list_entry(pos, struct frame, head);
1137 ktiocomplete(f);
1138 spin_lock_irq(&iocq.lock);
1142 static int
1143 kthread(void *vp)
1145 struct ktstate *k;
1146 DECLARE_WAITQUEUE(wait, current);
1147 int more;
1149 k = vp;
1150 current->flags |= PF_NOFREEZE;
1151 set_user_nice(current, -10);
1152 complete(&k->rendez); /* tell spawner we're running */
1153 do {
1154 spin_lock_irq(k->lock);
1155 more = k->fn();
1156 if (!more) {
1157 add_wait_queue(k->waitq, &wait);
1158 __set_current_state(TASK_INTERRUPTIBLE);
1160 spin_unlock_irq(k->lock);
1161 if (!more) {
1162 schedule();
1163 remove_wait_queue(k->waitq, &wait);
1164 } else
1165 cond_resched();
1166 } while (!kthread_should_stop());
1167 complete(&k->rendez); /* tell spawner we're stopping */
1168 return 0;
1171 void
1172 aoe_ktstop(struct ktstate *k)
1174 kthread_stop(k->task);
1175 wait_for_completion(&k->rendez);
1178 int
1179 aoe_ktstart(struct ktstate *k)
1181 struct task_struct *task;
1183 init_completion(&k->rendez);
1184 task = kthread_run(kthread, k, k->name);
1185 if (task == NULL || IS_ERR(task))
1186 return -ENOMEM;
1187 k->task = task;
1188 wait_for_completion(&k->rendez); /* allow kthread to start */
1189 init_completion(&k->rendez); /* for waiting for exit later */
1190 return 0;
1193 /* pass it off to kthreads for processing */
1194 static void
1195 ktcomplete(struct frame *f, struct sk_buff *skb)
1197 ulong flags;
1199 f->r_skb = skb;
1200 spin_lock_irqsave(&iocq.lock, flags);
1201 list_add_tail(&f->head, &iocq.head);
1202 spin_unlock_irqrestore(&iocq.lock, flags);
1203 wake_up(&ktiowq);
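/* Match a response to its outstanding frame by tag.  When the frame is
 * found, calc_rttavg gets a high-resolution sample from tsince_hr; when
 * it is not, the low-resolution tsince value recovered from the tag is
 * used instead.
 */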
1206 struct sk_buff *
1207 aoecmd_ata_rsp(struct sk_buff *skb)
1209 struct aoedev *d;
1210 struct aoe_hdr *h;
1211 struct frame *f;
1212 u32 n;
1213 ulong flags;
1214 char ebuf[128];
1215 u16 aoemajor;
1217 h = (struct aoe_hdr *) skb->data;
1218 aoemajor = be16_to_cpu(get_unaligned(&h->major));
1219 d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
1220 if (d == NULL) {
1221 snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
1222 "for unknown device %d.%d\n",
1223 aoemajor, h->minor);
1224 aoechr_error(ebuf);
1225 return skb;
1228 spin_lock_irqsave(&d->lock, flags);
1230 n = be32_to_cpu(get_unaligned(&h->tag));
1231 f = getframe(d, n);
1232 if (f) {
1233 calc_rttavg(d, f->t, tsince_hr(f));
1234 f->t->nout--;
1235 } else {
1236 f = getframe_deferred(d, n);
1237 if (f) {
1238 calc_rttavg(d, NULL, tsince_hr(f));
1239 } else {
1240 calc_rttavg(d, NULL, tsince(n));
1241 spin_unlock_irqrestore(&d->lock, flags);
1242 aoedev_put(d);
1243 snprintf(ebuf, sizeof(ebuf),
1244 "%15s e%d.%d tag=%08x@%08lx s=%pm d=%pm\n",
1245 "unexpected rsp",
1246 get_unaligned_be16(&h->major),
1247 h->minor,
1248 get_unaligned_be32(&h->tag),
1249 jiffies,
1250 h->src,
1251 h->dst);
1252 aoechr_error(ebuf);
1253 return skb;
1256 aoecmd_work(d);
1258 spin_unlock_irqrestore(&d->lock, flags);
1260 ktcomplete(f, skb);
1262 /*
1263 * Note here that we do not perform an aoedev_put, as we are
1264 * leaving this reference for the ktio to release.
1265 */
1266 return NULL;
1269 void
1270 aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
1272 struct sk_buff_head queue;
1274 __skb_queue_head_init(&queue);
1275 aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
1276 aoenet_xmit(&queue);
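/* Build an ATA identify device frame for the current target and reset
 * the RTT estimators to their initial values.
 */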
1279 struct sk_buff *
1280 aoecmd_ata_id(struct aoedev *d)
1282 struct aoe_hdr *h;
1283 struct aoe_atahdr *ah;
1284 struct frame *f;
1285 struct sk_buff *skb;
1286 struct aoetgt *t;
1288 f = newframe(d);
1289 if (f == NULL)
1290 return NULL;
1292 t = *d->tgt;
1294 /* initialize the headers & frame */
1295 skb = f->skb;
1296 h = (struct aoe_hdr *) skb_mac_header(skb);
1297 ah = (struct aoe_atahdr *) (h+1);
1298 skb_put(skb, sizeof *h + sizeof *ah);
1299 memset(h, 0, skb->len);
1300 f->tag = aoehdr_atainit(d, t, h);
1301 fhash(f);
1302 t->nout++;
1303 f->waited = 0;
1305 /* set up ata header */
1306 ah->scnt = 1;
1307 ah->cmdstat = ATA_CMD_ID_ATA;
1308 ah->lba3 = 0xa0;
1310 skb->dev = t->ifp->nd;
1312 d->rttavg = RTTAVG_INIT;
1313 d->rttdev = RTTDEV_INIT;
1314 d->timer.function = rexmit_timer;
1316 skb = skb_clone(skb, GFP_ATOMIC);
1317 if (skb) {
1318 do_gettimeofday(&f->sent);
1319 f->sent_jiffs = (u32) jiffies;
1322 return skb;
1325 static struct aoetgt *
1326 addtgt(struct aoedev *d, char *addr, ulong nframes)
1328 struct aoetgt *t, **tt, **te;
1330 tt = d->targets;
1331 te = tt + NTARGETS;
1332 for (; tt < te && *tt; tt++)
1333 ;
1335 if (tt == te) {
1336 printk(KERN_INFO
1337 "aoe: device addtgt failure; too many targets\n");
1338 return NULL;
1340 t = kzalloc(sizeof(*t), GFP_ATOMIC);
1341 if (!t) {
1342 printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
1343 return NULL;
1346 d->ntargets++;
1347 t->nframes = nframes;
1348 t->d = d;
1349 memcpy(t->addr, addr, sizeof t->addr);
1350 t->ifp = t->ifs;
1351 aoecmd_wreset(t);
1352 INIT_LIST_HEAD(&t->ffree);
1353 return *tt = t;
1356 static void
1357 setdbcnt(struct aoedev *d)
1359 struct aoetgt **t, **e;
1360 int bcnt = 0;
1362 t = d->targets;
1363 e = t + NTARGETS;
1364 for (; t < e && *t; t++)
1365 if (bcnt == 0 || bcnt > (*t)->minbcnt)
1366 bcnt = (*t)->minbcnt;
1367 if (bcnt != d->maxbcnt) {
1368 d->maxbcnt = bcnt;
1369 pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
1370 d->aoemajor, d->aoeminor, bcnt);
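/* Record the usable payload size for this target/interface pair and
 * recompute the device-wide maximum, which is the minimum across all
 * targets and interfaces.
 */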
1374 static void
1375 setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
1377 struct aoedev *d;
1378 struct aoeif *p, *e;
1379 int minbcnt;
1381 d = t->d;
1382 minbcnt = bcnt;
1383 p = t->ifs;
1384 e = p + NAOEIFS;
1385 for (; p < e; p++) {
1386 if (p->nd == NULL)
1387 break; /* end of the valid interfaces */
1388 if (p->nd == nd) {
1389 p->bcnt = bcnt; /* we're updating */
1390 nd = NULL;
1391 } else if (minbcnt > p->bcnt)
1392 minbcnt = p->bcnt; /* find the min interface */
1394 if (nd) {
1395 if (p == e) {
1396 pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
1397 return;
1399 dev_hold(nd);
1400 p->nd = nd;
1401 p->bcnt = bcnt;
1403 t->minbcnt = minbcnt;
1404 setdbcnt(d);
1407 void
1408 aoecmd_cfg_rsp(struct sk_buff *skb)
1410 struct aoedev *d;
1411 struct aoe_hdr *h;
1412 struct aoe_cfghdr *ch;
1413 struct aoetgt *t;
1414 ulong flags, aoemajor;
1415 struct sk_buff *sl;
1416 struct sk_buff_head queue;
1417 u16 n;
1419 sl = NULL;
1420 h = (struct aoe_hdr *) skb_mac_header(skb);
1421 ch = (struct aoe_cfghdr *) (h+1);
1423 /*
1424 * Enough people have their dip switches set backwards to
1425 * warrant a loud message for this special case.
1426 */
1427 aoemajor = get_unaligned_be16(&h->major);
1428 if (aoemajor == 0xfff) {
1429 printk(KERN_ERR "aoe: Warning: shelf address is all ones. "
1430 "Check shelf dip switches.\n");
1431 return;
1433 if (aoemajor == 0xffff) {
1434 pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
1435 aoemajor, (int) h->minor);
1436 return;
1438 if (h->minor == 0xff) {
1439 pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
1440 aoemajor, (int) h->minor);
1441 return;
1444 n = be16_to_cpu(ch->bufcnt);
1445 if (n > aoe_maxout) /* keep it reasonable */
1446 n = aoe_maxout;
1448 d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
1449 if (d == NULL) {
1450 pr_info("aoe: device allocation failure\n");
1451 return;
1454 spin_lock_irqsave(&d->lock, flags);
1456 t = gettgt(d, h->src);
1457 if (t) {
1458 t->nframes = n;
1459 if (n < t->maxout)
1460 aoecmd_wreset(t);
1461 } else {
1462 t = addtgt(d, h->src, n);
1463 if (!t)
1464 goto bail;
1466 n = skb->dev->mtu;
1467 n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
1468 n /= 512;
1469 if (n > ch->scnt)
1470 n = ch->scnt;
1471 n = n ? n * 512 : DEFAULTBCNT;
1472 setifbcnt(t, skb->dev, n);
1474 /* don't change users' perspective */
1475 if (d->nopen == 0) {
1476 d->fw_ver = be16_to_cpu(ch->fwver);
1477 sl = aoecmd_ata_id(d);
1479 bail:
1480 spin_unlock_irqrestore(&d->lock, flags);
1481 aoedev_put(d);
1482 if (sl) {
1483 __skb_queue_head_init(&queue);
1484 __skb_queue_tail(&queue, sl);
1485 aoenet_xmit(&queue);
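/* Window reset: drop to one outstanding frame and restart slow start
 * with ssthresh at half of the target's frame count.
 */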
1489 void
1490 aoecmd_wreset(struct aoetgt *t)
1492 t->maxout = 1;
1493 t->ssthresh = t->nframes / 2;
1494 t->next_cwnd = t->nframes;
1497 void
1498 aoecmd_cleanslate(struct aoedev *d)
1500 struct aoetgt **t, **te;
1502 d->rttavg = RTTAVG_INIT;
1503 d->rttdev = RTTDEV_INIT;
1504 d->maxbcnt = 0;
1506 t = d->targets;
1507 te = t + NTARGETS;
1508 for (; t < te && *t; t++)
1509 aoecmd_wreset(*t);
1512 void
1513 aoe_failbuf(struct aoedev *d, struct buf *buf)
1515 if (buf == NULL)
1516 return;
1517 buf->resid = 0;
1518 clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
1519 if (buf->nframesout == 0)
1520 aoe_end_buf(d, buf);
1523 void
1524 aoe_flush_iocq(void)
1526 struct frame *f;
1527 struct aoedev *d;
1528 LIST_HEAD(flist);
1529 struct list_head *pos;
1530 struct sk_buff *skb;
1531 ulong flags;
1533 spin_lock_irqsave(&iocq.lock, flags);
1534 list_splice_init(&iocq.head, &flist);
1535 spin_unlock_irqrestore(&iocq.lock, flags);
1536 while (!list_empty(&flist)) {
1537 pos = flist.next;
1538 list_del(pos);
1539 f = list_entry(pos, struct frame, head);
1540 d = f->t->d;
1541 skb = f->r_skb;
1542 spin_lock_irqsave(&d->lock, flags);
1543 if (f->buf) {
1544 f->buf->nframesout--;
1545 aoe_failbuf(d, f->buf);
1547 aoe_freetframe(f);
1548 spin_unlock_irqrestore(&d->lock, flags);
1549 dev_kfree_skb(skb);
1550 aoedev_put(d);
1554 int __init
1555 aoecmd_init(void)
1557 INIT_LIST_HEAD(&iocq.head);
1558 spin_lock_init(&iocq.lock);
1559 init_waitqueue_head(&ktiowq);
1560 kts.name = "aoe_ktio";
1561 kts.fn = ktio;
1562 kts.waitq = &ktiowq;
1563 kts.lock = &iocq.lock;
1564 return aoe_ktstart(&kts);
1567 void
1568 aoecmd_exit(void)
1570 aoe_ktstop(&kts);
1571 aoe_flush_iocq();