summary refs log tree commit diff
path: root/drivers/block/aoe/aoecmd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/aoe/aoecmd.c')
-rw-r--r--drivers/block/aoe/aoecmd.c1232
1 files changed, 813 insertions, 419 deletions
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 887f68f6d79a..3804a0af3ef1 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoecmd.c
  * Filesystem request handling methods
@@ -12,10 +12,19 @@
 #include <linux/netdevice.h>
 #include <linux/genhd.h>
 #include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <net/net_namespace.h>
 #include <asm/unaligned.h>
+#include <linux/uio.h>
 #include "aoe.h"
 
+#define MAXIOC (8192)	/* default meant to avoid most soft lockups */
+
+static void ktcomplete(struct frame *, struct sk_buff *);
+
+static struct buf *nextbuf(struct aoedev *);
+
 static int aoe_deadsecs = 60 * 3;
 module_param(aoe_deadsecs, int, 0644);
 MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
@@ -25,6 +34,15 @@ module_param(aoe_maxout, int, 0644);
 MODULE_PARM_DESC(aoe_maxout,
 	"Only aoe_maxout outstanding packets for every MAC on eX.Y.");
 
+static wait_queue_head_t ktiowq;
+static struct ktstate kts;
+
+/* io completion queue */
+static struct {
+	struct list_head head;
+	spinlock_t lock;
+} iocq;
+
 static struct sk_buff *
 new_skb(ulong len)
 {
@@ -41,15 +59,21 @@ new_skb(ulong len)
 }
 
 static struct frame *
-getframe(struct aoetgt *t, int tag)
+getframe(struct aoedev *d, u32 tag)
 {
-	struct frame *f, *e;
+	struct frame *f;
+	struct list_head *head, *pos, *nx;
+	u32 n;
 
-	f = t->frames;
-	e = f + t->nframes;
-	for (; f<e; f++)
-		if (f->tag == tag)
+	n = tag % NFACTIVE;
+	head = &d->factive[n];
+	list_for_each_safe(pos, nx, head) {
+		f = list_entry(pos, struct frame, head);
+		if (f->tag == tag) {
+			list_del(pos);
 			return f;
+		}
+	}
 	return NULL;
 }
 
@@ -59,18 +83,18 @@ getframe(struct aoetgt *t, int tag)
  * This driver reserves tag -1 to mean "unused frame."
  */
 static int
-newtag(struct aoetgt *t)
+newtag(struct aoedev *d)
 {
 	register ulong n;
 
 	n = jiffies & 0xffff;
-	return n |= (++t->lasttag & 0x7fff) << 16;
+	return n |= (++d->lasttag & 0x7fff) << 16;
 }
 
-static int
+static u32
 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
 {
-	u32 host_tag = newtag(t);
+	u32 host_tag = newtag(d);
 
 	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
 	memcpy(h->dst, t->addr, sizeof h->dst);
@@ -95,16 +119,18 @@ put_lba(struct aoe_atahdr *ah, sector_t lba)
 	ah->lba5 = lba >>= 8;
 }
 
-static void
+static struct aoeif *
 ifrotate(struct aoetgt *t)
 {
-	t->ifp++;
-	if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
-		t->ifp = t->ifs;
-	if (t->ifp->nd == NULL) {
-		printk(KERN_INFO "aoe: no interface to rotate to\n");
-		BUG();
-	}
+	struct aoeif *ifp;
+
+	ifp = t->ifp;
+	ifp++;
+	if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
+		ifp = t->ifs;
+	if (ifp->nd == NULL)
+		return NULL;
+	return t->ifp = ifp;
 }
 
 static void
@@ -129,78 +155,128 @@ skb_pool_get(struct aoedev *d)
 	return NULL;
 }
 
-/* freeframe is where we do our load balancing so it's a little hairy. */
+void
+aoe_freetframe(struct frame *f)
+{
+	struct aoetgt *t;
+
+	t = f->t;
+	f->buf = NULL;
+	f->bv = NULL;
+	f->r_skb = NULL;
+	list_add(&f->head, &t->ffree);
+}
+
 static struct frame *
-freeframe(struct aoedev *d)
+newtframe(struct aoedev *d, struct aoetgt *t)
 {
-	struct frame *f, *e, *rf;
-	struct aoetgt **t;
+	struct frame *f;
 	struct sk_buff *skb;
+	struct list_head *pos;
+
+	if (list_empty(&t->ffree)) {
+		if (t->falloc >= NSKBPOOLMAX*2)
+			return NULL;
+		f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
+		if (f == NULL)
+			return NULL;
+		t->falloc++;
+		f->t = t;
+	} else {
+		pos = t->ffree.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+	}
+
+	skb = f->skb;
+	if (skb == NULL) {
+		f->skb = skb = new_skb(ETH_ZLEN);
+		if (!skb) {
+bail:			aoe_freetframe(f);
+			return NULL;
+		}
+	}
+
+	if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
+		skb = skb_pool_get(d);
+		if (skb == NULL)
+			goto bail;
+		skb_pool_put(d, f->skb);
+		f->skb = skb;
+	}
+
+	skb->truesize -= skb->data_len;
+	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+	skb_trim(skb, 0);
+	return f;
+}
+
+static struct frame *
+newframe(struct aoedev *d)
+{
+	struct frame *f;
+	struct aoetgt *t, **tt;
+	int totout = 0;
 
 	if (d->targets[0] == NULL) {	/* shouldn't happen, but I'm paranoid */
 		printk(KERN_ERR "aoe: NULL TARGETS!\n");
 		return NULL;
 	}
-	t = d->tgt;
-	t++;
-	if (t >= &d->targets[NTARGETS] || !*t)
-		t = d->targets;
+	tt = d->tgt;	/* last used target */
 	for (;;) {
-		if ((*t)->nout < (*t)->maxout
+		tt++;
+		if (tt >= &d->targets[NTARGETS] || !*tt)
+			tt = d->targets;
+		t = *tt;
+		totout += t->nout;
+		if (t->nout < t->maxout
 		&& t != d->htgt
-		&& (*t)->ifp->nd) {
-			rf = NULL;
-			f = (*t)->frames;
-			e = f + (*t)->nframes;
-			for (; f < e; f++) {
-				if (f->tag != FREETAG)
-					continue;
-				skb = f->skb;
-				if (!skb
-				&& !(f->skb = skb = new_skb(ETH_ZLEN)))
-					continue;
-				if (atomic_read(&skb_shinfo(skb)->dataref)
-					!= 1) {
-					if (!rf)
-						rf = f;
-					continue;
-				}
-gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
-				skb_trim(skb, 0);
-				d->tgt = t;
-				ifrotate(*t);
+		&& t->ifp->nd) {
+			f = newtframe(d, t);
+			if (f) {
+				ifrotate(t);
+				d->tgt = tt;
 				return f;
 			}
-			/* Work can be done, but the network layer is
-			   holding our precious packets.  Try to grab
-			   one from the pool. */
-			f = rf;
-			if (f == NULL) {	/* more paranoia */
-				printk(KERN_ERR
-					"aoe: freeframe: %s.\n",
-					"unexpected null rf");
-				d->flags |= DEVFL_KICKME;
-				return NULL;
-			}
-			skb = skb_pool_get(d);
-			if (skb) {
-				skb_pool_put(d, f->skb);
-				f->skb = skb;
-				goto gotone;
-			}
-			(*t)->dataref++;
-			if ((*t)->nout == 0)
-				d->flags |= DEVFL_KICKME;
 		}
-		if (t == d->tgt)	/* we've looped and found nada */
+		if (tt == d->tgt)	/* we've looped and found nada */
 			break;
-		t++;
-		if (t >= &d->targets[NTARGETS] || !*t)
-			t = d->targets;
+	}
+	if (totout == 0) {
+		d->kicked++;
+		d->flags |= DEVFL_KICKME;
 	}
 	return NULL;
 }
 
+static void
+skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
+{
+	int frag = 0;
+	ulong fcnt;
+loop:
+	fcnt = bv->bv_len - (off - bv->bv_offset);
+	if (fcnt > cnt)
+		fcnt = cnt;
+	skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
+	cnt -= fcnt;
+	if (cnt <= 0)
+		return;
+	bv++;
+	off = bv->bv_offset;
+	goto loop;
+}
+
+static void
+fhash(struct frame *f)
+{
+	struct aoedev *d = f->t->d;
+	u32 n;
+
+	n = f->tag % NFACTIVE;
+	list_add_tail(&f->head, &d->factive[n]);
+}
+
 static int
 aoecmd_ata_rw(struct aoedev *d)
 {
@@ -208,26 +284,47 @@ aoecmd_ata_rw(struct aoedev *d)
 	struct aoe_hdr *h;
 	struct aoe_atahdr *ah;
 	struct buf *buf;
-	struct bio_vec *bv;
 	struct aoetgt *t;
 	struct sk_buff *skb;
-	ulong bcnt;
+	struct sk_buff_head queue;
+	ulong bcnt, fbcnt;
 	char writebit, extbit;
 
 	writebit = 0x10;
 	extbit = 0x4;
 
-	f = freeframe(d);
+	buf = nextbuf(d);
+	if (buf == NULL)
+		return 0;
+	f = newframe(d);
 	if (f == NULL)
 		return 0;
 	t = *d->tgt;
-	buf = d->inprocess;
-	bv = buf->bv;
-	bcnt = t->ifp->maxbcnt;
+	bcnt = d->maxbcnt;
 	if (bcnt == 0)
 		bcnt = DEFAULTBCNT;
-	if (bcnt > buf->bv_resid)
-		bcnt = buf->bv_resid;
+	if (bcnt > buf->resid)
+		bcnt = buf->resid;
+	fbcnt = bcnt;
+	f->bv = buf->bv;
+	f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
+	do {
+		if (fbcnt < buf->bv_resid) {
+			buf->bv_resid -= fbcnt;
+			buf->resid -= fbcnt;
+			break;
+		}
+		fbcnt -= buf->bv_resid;
+		buf->resid -= buf->bv_resid;
+		if (buf->resid == 0) {
+			d->ip.buf = NULL;
+			break;
+		}
+		buf->bv++;
+		buf->bv_resid = buf->bv->bv_len;
+		WARN_ON(buf->bv_resid == 0);
+	} while (fbcnt);
+
 	/* initialize the headers & frame */
 	skb = f->skb;
 	h = (struct aoe_hdr *) skb_mac_header(skb);
@@ -235,10 +332,10 @@ aoecmd_ata_rw(struct aoedev *d)
 	skb_put(skb, sizeof *h + sizeof *ah);
 	memset(h, 0, skb->len);
 	f->tag = aoehdr_atainit(d, t, h);
+	fhash(f);
 	t->nout++;
 	f->waited = 0;
 	f->buf = buf;
-	f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
 	f->bcnt = bcnt;
 	f->lba = buf->sector;
 
@@ -253,10 +350,11 @@ aoecmd_ata_rw(struct aoedev *d)
 		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
 	}
 	if (bio_data_dir(buf->bio) == WRITE) {
-		skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
+		skb_fillup(skb, f->bv, f->bv_off, bcnt);
 		ah->aflags |= AOEAFL_WRITE;
 		skb->len += bcnt;
 		skb->data_len = bcnt;
+		skb->truesize += bcnt;
 		t->wpkts++;
 	} else {
 		t->rpkts++;
@@ -267,23 +365,15 @@ aoecmd_ata_rw(struct aoedev *d)
 
 	/* mark all tracking fields and load out */
 	buf->nframesout += 1;
-	buf->bv_off += bcnt;
-	buf->bv_resid -= bcnt;
-	buf->resid -= bcnt;
 	buf->sector += bcnt >> 9;
-	if (buf->resid == 0) {
-		d->inprocess = NULL;
-	} else if (buf->bv_resid == 0) {
-		buf->bv = ++bv;
-		buf->bv_resid = bv->bv_len;
-		WARN_ON(buf->bv_resid == 0);
-		buf->bv_off = bv->bv_offset;
-	}
 
 	skb->dev = t->ifp->nd;
 	skb = skb_clone(skb, GFP_ATOMIC);
-	if (skb)
-		__skb_queue_tail(&d->sendq, skb);
+	if (skb) {
+		__skb_queue_head_init(&queue);
+		__skb_queue_tail(&queue, skb);
+		aoenet_xmit(&queue);
+	}
 	return 1;
 }
 
@@ -330,17 +420,25 @@ cont:
 }
 
 static void
-resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
+resend(struct aoedev *d, struct frame *f)
 {
 	struct sk_buff *skb;
+	struct sk_buff_head queue;
 	struct aoe_hdr *h;
 	struct aoe_atahdr *ah;
+	struct aoetgt *t;
 	char buf[128];
 	u32 n;
 
-	ifrotate(t);
-	n = newtag(t);
+	t = f->t;
+	n = newtag(d);
 	skb = f->skb;
+	if (ifrotate(t) == NULL) {
+		/* probably can't happen, but set it up to fail anyway */
+		pr_info("aoe: resend: no interfaces to rotate to.\n");
+		ktcomplete(f, NULL);
+		return;
+	}
 	h = (struct aoe_hdr *) skb_mac_header(skb);
 	ah = (struct aoe_atahdr *) (h+1);
 
@@ -351,39 +449,22 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
 	aoechr_error(buf);
 
 	f->tag = n;
+	fhash(f);
 	h->tag = cpu_to_be32(n);
 	memcpy(h->dst, t->addr, sizeof h->dst);
 	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
 
-	switch (ah->cmdstat) {
-	default:
-		break;
-	case ATA_CMD_PIO_READ:
-	case ATA_CMD_PIO_READ_EXT:
-	case ATA_CMD_PIO_WRITE:
-	case ATA_CMD_PIO_WRITE_EXT:
-		put_lba(ah, f->lba);
-
-		n = f->bcnt;
-		if (n > DEFAULTBCNT)
-			n = DEFAULTBCNT;
-		ah->scnt = n >> 9;
-		if (ah->aflags & AOEAFL_WRITE) {
-			skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
-				offset_in_page(f->bufaddr), n);
-			skb->len = sizeof *h + sizeof *ah + n;
-			skb->data_len = n;
-		}
-	}
 	skb->dev = t->ifp->nd;
 	skb = skb_clone(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return;
-	__skb_queue_tail(&d->sendq, skb);
+	__skb_queue_head_init(&queue);
+	__skb_queue_tail(&queue, skb);
+	aoenet_xmit(&queue);
 }
 
 static int
-tsince(int tag)
+tsince(u32 tag)
 {
 	int n;
 
@@ -407,58 +488,65 @@ getif(struct aoetgt *t, struct net_device *nd)
 	return NULL;
 }
 
-static struct aoeif *
-addif(struct aoetgt *t, struct net_device *nd)
-{
-	struct aoeif *p;
-
-	p = getif(t, NULL);
-	if (!p)
-		return NULL;
-	p->nd = nd;
-	p->maxbcnt = DEFAULTBCNT;
-	p->lost = 0;
-	p->lostjumbo = 0;
-	return p;
-}
-
 static void
 ejectif(struct aoetgt *t, struct aoeif *ifp)
 {
 	struct aoeif *e;
+	struct net_device *nd;
 	ulong n;
 
+	nd = ifp->nd;
 	e = t->ifs + NAOEIFS - 1;
 	n = (e - ifp) * sizeof *ifp;
 	memmove(ifp, ifp+1, n);
 	e->nd = NULL;
+	dev_put(nd);
 }
 
 static int
 sthtith(struct aoedev *d)
 {
-	struct frame *f, *e, *nf;
+	struct frame *f, *nf;
+	struct list_head *nx, *pos, *head;
 	struct sk_buff *skb;
-	struct aoetgt *ht = *d->htgt;
-
-	f = ht->frames;
-	e = f + ht->nframes;
-	for (; f < e; f++) {
-		if (f->tag == FREETAG)
-			continue;
-		nf = freeframe(d);
-		if (!nf)
-			return 0;
-		skb = nf->skb;
-		*nf = *f;
-		f->skb = skb;
-		f->tag = FREETAG;
-		nf->waited = 0;
-		ht->nout--;
-		(*d->tgt)->nout++;
-		resend(d, *d->tgt, nf);
+	struct aoetgt *ht = d->htgt;
+	int i;
+
+	for (i = 0; i < NFACTIVE; i++) {
+		head = &d->factive[i];
+		list_for_each_safe(pos, nx, head) {
+			f = list_entry(pos, struct frame, head);
+			if (f->t != ht)
+				continue;
+
+			nf = newframe(d);
+			if (!nf)
+				return 0;
+
+			/* remove frame from active list */
+			list_del(pos);
+
+			/* reassign all pertinent bits to new outbound frame */
+			skb = nf->skb;
+			nf->skb = f->skb;
+			nf->buf = f->buf;
+			nf->bcnt = f->bcnt;
+			nf->lba = f->lba;
+			nf->bv = f->bv;
+			nf->bv_off = f->bv_off;
+			nf->waited = 0;
+			f->skb = skb;
+			aoe_freetframe(f);
+			ht->nout--;
+			nf->t->nout++;
+			resend(d, nf);
+		}
 	}
-	/* he's clean, he's useless.  take away his interfaces */
+	/* We've cleaned up the outstanding so take away his
+	 * interfaces so he won't be used.  We should remove him from
+	 * the target array here, but cleaning up a target is
+	 * involved.  PUNT!
+	 */
 	memset(ht->ifs, 0, sizeof ht->ifs);
 	d->htgt = NULL;
 	return 1;
@@ -477,13 +565,15 @@ ata_scnt(unsigned char *packet) {
 static void
 rexmit_timer(ulong vp)
 {
-	struct sk_buff_head queue;
 	struct aoedev *d;
 	struct aoetgt *t, **tt, **te;
 	struct aoeif *ifp;
-	struct frame *f, *e;
+	struct frame *f;
+	struct list_head *head, *pos, *nx;
+	LIST_HEAD(flist);
 	register long timeout;
 	ulong flags, n;
+	int i;
 
 	d = (struct aoedev *) vp;
 
@@ -497,58 +587,22 @@ rexmit_timer(ulong vp)
 		spin_unlock_irqrestore(&d->lock, flags);
 		return;
 	}
-	tt = d->targets;
-	te = tt + NTARGETS;
-	for (; tt < te && *tt; tt++) {
-		t = *tt;
-		f = t->frames;
-		e = f + t->nframes;
-		for (; f < e; f++) {
-			if (f->tag == FREETAG
-			|| tsince(f->tag) < timeout)
-				continue;
-			n = f->waited += timeout;
-			n /= HZ;
-			if (n > aoe_deadsecs) {
-				/* waited too long.  device failure. */
-				aoedev_downdev(d);
-				break;
-			}
-
-			if (n > HELPWAIT /* see if another target can help */
-			&& (tt != d->targets || d->targets[1]))
-				d->htgt = tt;
-
-			if (t->nout == t->maxout) {
-				if (t->maxout > 1)
-					t->maxout--;
-				t->lastwadj = jiffies;
-			}
-
-			ifp = getif(t, f->skb->dev);
-			if (ifp && ++ifp->lost > (t->nframes << 1)
-			&& (ifp != t->ifs || t->ifs[1].nd)) {
-				ejectif(t, ifp);
-				ifp = NULL;
-			}
 
-			if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
-			&& ifp && ++ifp->lostjumbo > (t->nframes << 1)
-			&& ifp->maxbcnt != DEFAULTBCNT) {
-				printk(KERN_INFO
-					"aoe: e%ld.%d: "
-					"too many lost jumbo on "
-					"%s:%pm - "
-					"falling back to %d frames.\n",
-					d->aoemajor, d->aoeminor,
-					ifp->nd->name, t->addr,
-					DEFAULTBCNT);
-				ifp->maxbcnt = 0;
-			}
-			resend(d, t, f);
+	/* collect all frames to rexmit into flist */
+	for (i = 0; i < NFACTIVE; i++) {
+		head = &d->factive[i];
+		list_for_each_safe(pos, nx, head) {
+			f = list_entry(pos, struct frame, head);
+			if (tsince(f->tag) < timeout)
+				break;	/* end of expired frames */
+			/* move to flist for later processing */
+			list_move_tail(pos, &flist);
 		}
-
-		/* window check */
+	}
+	/* window check */
+	tt = d->targets;
+	te = tt + d->ntargets;
+	for (; tt < te && (t = *tt); tt++) {
 		if (t->nout == t->maxout
 		&& t->maxout < t->nframes
 		&& (jiffies - t->lastwadj)/HZ > 10) {
@@ -557,45 +611,173 @@ rexmit_timer(ulong vp)
 		}
 	}
 
-	if (!skb_queue_empty(&d->sendq)) {
+	if (!list_empty(&flist)) {	/* retransmissions necessary */
 		n = d->rttavg <<= 1;
 		if (n > MAXTIMER)
 			d->rttavg = MAXTIMER;
 	}
 
-	if (d->flags & DEVFL_KICKME || d->htgt) {
-		d->flags &= ~DEVFL_KICKME;
-		aoecmd_work(d);
+	/* process expired frames */
+	while (!list_empty(&flist)) {
+		pos = flist.next;
+		f = list_entry(pos, struct frame, head);
+		n = f->waited += timeout;
+		n /= HZ;
+		if (n > aoe_deadsecs) {
+			/* Waited too long.  Device failure.
+			 * Hang all frames on first hash bucket for downdev
+			 * to clean up.
+			 */
+			list_splice(&flist, &d->factive[0]);
+			aoedev_downdev(d);
+			break;
+		}
+		list_del(pos);
+
+		t = f->t;
+		if (n > aoe_deadsecs/2)
+			d->htgt = t; /* see if another target can help */
+
+		if (t->nout == t->maxout) {
+			if (t->maxout > 1)
+				t->maxout--;
+			t->lastwadj = jiffies;
+		}
+
+		ifp = getif(t, f->skb->dev);
+		if (ifp && ++ifp->lost > (t->nframes << 1)
+		&& (ifp != t->ifs || t->ifs[1].nd)) {
+			ejectif(t, ifp);
+			ifp = NULL;
+		}
+		resend(d, f);
 	}
 
-	__skb_queue_head_init(&queue);
-	skb_queue_splice_init(&d->sendq, &queue);
+	if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
+		d->flags &= ~DEVFL_KICKME;
+		d->blkq->request_fn(d->blkq);
+	}
 
 	d->timer.expires = jiffies + TIMERTICK;
 	add_timer(&d->timer);
 
 	spin_unlock_irqrestore(&d->lock, flags);
+}
 
-	aoenet_xmit(&queue);
+static unsigned long
+rqbiocnt(struct request *r)
+{
+	struct bio *bio;
+	unsigned long n = 0;
+
+	__rq_for_each_bio(bio, r)
+		n++;
+	return n;
+}
+
+/* This can be removed if we are certain that no users of the block
+ * layer will ever use zero-count pages in bios.  Otherwise we have to
+ * protect against the put_page sometimes done by the network layer.
+ *
+ * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+ * discussion.
+ *
+ * We cannot use get_page in the workaround, because it insists on a
+ * positive page count as a precondition.  So we use _count directly.
+ */
+static void
+bio_pageinc(struct bio *bio)
+{
+	struct bio_vec *bv;
+	struct page *page;
+	int i;
+
+	bio_for_each_segment(bv, bio, i) {
+		page = bv->bv_page;
+		/* Non-zero page count for non-head members of
+		 * compound pages is no longer allowed by the kernel,
+		 * but this has never been seen here.
+		 */
+		if (unlikely(PageCompound(page)))
+			if (compound_trans_head(page) != page) {
+				pr_crit("page tail used for block I/O\n");
+				BUG();
+			}
+		atomic_inc(&page->_count);
+	}
+}
+
+static void
+bio_pagedec(struct bio *bio)
+{
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment(bv, bio, i)
+		atomic_dec(&bv->bv_page->_count);
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+	struct bio_vec *bv;
+
+	memset(buf, 0, sizeof(*buf));
+	buf->rq = rq;
+	buf->bio = bio;
+	buf->resid = bio->bi_size;
+	buf->sector = bio->bi_sector;
+	bio_pageinc(bio);
+	buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
+	buf->bv_resid = bv->bv_len;
+	WARN_ON(buf->bv_resid == 0);
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+	struct request *rq;
+	struct request_queue *q;
+	struct buf *buf;
+	struct bio *bio;
+
+	q = d->blkq;
+	if (q == NULL)
+		return NULL;	/* initializing */
+	if (d->ip.buf)
+		return d->ip.buf;
+	rq = d->ip.rq;
+	if (rq == NULL) {
+		rq = blk_peek_request(q);
+		if (rq == NULL)
+			return NULL;
+		blk_start_request(rq);
+		d->ip.rq = rq;
+		d->ip.nxbio = rq->bio;
+		rq->special = (void *) rqbiocnt(rq);
+	}
+	buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+	if (buf == NULL) {
+		pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+		return NULL;
+	}
+	bio = d->ip.nxbio;
+	bufinit(buf, rq, bio);
+	bio = bio->bi_next;
+	d->ip.nxbio = bio;
+	if (bio == NULL)
+		d->ip.rq = NULL;
+	return d->ip.buf = buf;
 }
 
 /* enters with d->lock held */
 void
 aoecmd_work(struct aoedev *d)
 {
-	struct buf *buf;
-loop:
 	if (d->htgt && !sthtith(d))
 		return;
-	if (d->inprocess == NULL) {
-		if (list_empty(&d->bufq))
-			return;
-		buf = container_of(d->bufq.next, struct buf, bufs);
-		list_del(d->bufq.next);
-		d->inprocess = buf;
-	}
-	if (aoecmd_ata_rw(d))
-		goto loop;
+	while (aoecmd_ata_rw(d))
+		;
 }
 
 /* this function performs work that has been deferred until sleeping is OK
@@ -604,28 +786,25 @@ void
 aoecmd_sleepwork(struct work_struct *work)
 {
 	struct aoedev *d = container_of(work, struct aoedev, work);
+	struct block_device *bd;
+	u64 ssize;
 
 	if (d->flags & DEVFL_GDALLOC)
 		aoeblk_gdalloc(d);
 
 	if (d->flags & DEVFL_NEWSIZE) {
-		struct block_device *bd;
-		unsigned long flags;
-		u64 ssize;
-
 		ssize = get_capacity(d->gd);
 		bd = bdget_disk(d->gd, 0);
-
 		if (bd) {
 			mutex_lock(&bd->bd_inode->i_mutex);
 			i_size_write(bd->bd_inode, (loff_t)ssize<<9);
 			mutex_unlock(&bd->bd_inode->i_mutex);
 			bdput(bd);
 		}
-		spin_lock_irqsave(&d->lock, flags);
+		spin_lock_irq(&d->lock);
 		d->flags |= DEVFL_UP;
 		d->flags &= ~DEVFL_NEWSIZE;
-		spin_unlock_irqrestore(&d->lock, flags);
+		spin_unlock_irq(&d->lock);
 	}
 }
 
@@ -718,163 +897,299 @@ gettgt(struct aoedev *d, char *addr)
 	return NULL;
 }
 
-static inline void
-diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
+static void
+bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
+{
+	ulong fcnt;
+	char *p;
+	int soff = 0;
+loop:
+	fcnt = bv->bv_len - (off - bv->bv_offset);
+	if (fcnt > cnt)
+		fcnt = cnt;
+	p = page_address(bv->bv_page) + off;
+	skb_copy_bits(skb, soff, p, fcnt);
+	soff += fcnt;
+	cnt -= fcnt;
+	if (cnt <= 0)
+		return;
+	bv++;
+	off = bv->bv_offset;
+	goto loop;
+}
+
+void
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+	struct bio *bio;
+	int bok;
+	struct request_queue *q;
+
+	q = d->blkq;
+	if (rq == d->ip.rq)
+		d->ip.rq = NULL;
+	do {
+		bio = rq->bio;
+		bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
+	} while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size));
+
+	/* cf. http://lkml.org/lkml/2006/10/31/28 */
+	if (!fastfail)
+		q->request_fn(q);
+}
+
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+	struct request *rq;
+	unsigned long n;
+
+	if (buf == d->ip.buf)
+		d->ip.buf = NULL;
+	rq = buf->rq;
+	bio_pagedec(buf->bio);
+	mempool_free(buf, d->bufpool);
+	n = (unsigned long) rq->special;
+	rq->special = (void *) --n;
+	if (n == 0)
+		aoe_end_request(d, rq, 0);
+}
+
+static void
+ktiocomplete(struct frame *f)
 {
-	unsigned long n_sect = bio->bi_size >> 9;
-	const int rw = bio_data_dir(bio);
-	struct hd_struct *part;
-	int cpu;
+	struct aoe_hdr *hin, *hout;
+	struct aoe_atahdr *ahin, *ahout;
+	struct buf *buf;
+	struct sk_buff *skb;
+	struct aoetgt *t;
+	struct aoeif *ifp;
+	struct aoedev *d;
+	long n;
+
+	if (f == NULL)
+		return;
+
+	t = f->t;
+	d = t->d;
+
+	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
+	ahout = (struct aoe_atahdr *) (hout+1);
+	buf = f->buf;
+	skb = f->r_skb;
+	if (skb == NULL)
+		goto noskb;	/* just fail the buf. */
+
+	hin = (struct aoe_hdr *) skb->data;
+	skb_pull(skb, sizeof(*hin));
+	ahin = (struct aoe_atahdr *) skb->data;
+	skb_pull(skb, sizeof(*ahin));
+	if (ahin->cmdstat & 0xa9) {	/* these bits cleared on success */
+		pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
+			ahout->cmdstat, ahin->cmdstat,
+			d->aoemajor, d->aoeminor);
+noskb:	if (buf)
+			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+		goto badrsp;
+	}
 
-	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(disk, sector);
+	n = ahout->scnt << 9;
+	switch (ahout->cmdstat) {
+	case ATA_CMD_PIO_READ:
+	case ATA_CMD_PIO_READ_EXT:
+		if (skb->len < n) {
+			pr_err("aoe: runt data size in read.  skb->len=%d need=%ld\n",
+				skb->len, n);
+			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+			break;
+		}
+		bvcpy(f->bv, f->bv_off, skb, n);
+	case ATA_CMD_PIO_WRITE:
+	case ATA_CMD_PIO_WRITE_EXT:
+		spin_lock_irq(&d->lock);
+		ifp = getif(t, skb->dev);
+		if (ifp)
+			ifp->lost = 0;
+		if (d->htgt == t) /* I'll help myself, thank you. */
+			d->htgt = NULL;
+		spin_unlock_irq(&d->lock);
+		break;
+	case ATA_CMD_ID_ATA:
+		if (skb->len < 512) {
+			pr_info("aoe: runt data size in ataid.  skb->len=%d\n",
+				skb->len);
+			break;
+		}
+		if (skb_linearize(skb))
+			break;
+		spin_lock_irq(&d->lock);
+		ataid_complete(d, t, skb->data);
+		spin_unlock_irq(&d->lock);
+		break;
+	default:
+		pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
+			ahout->cmdstat,
+			be16_to_cpu(get_unaligned(&hin->major)),
+			hin->minor);
+	}
+badrsp:
+	spin_lock_irq(&d->lock);
+
+	aoe_freetframe(f);
+
+	if (buf && --buf->nframesout == 0 && buf->resid == 0)
+		aoe_end_buf(d, buf);
+
+	aoecmd_work(d);
+
+	spin_unlock_irq(&d->lock);
+	aoedev_put(d);
+	dev_kfree_skb(skb);
+}
+
+/* Enters with iocq.lock held.
+ * Returns true iff responses needing processing remain.
+ */
+static int
+ktio(void)
+{
+	struct frame *f;
+	struct list_head *pos;
+	int i;
 
-	part_stat_inc(cpu, part, ios[rw]);
-	part_stat_add(cpu, part, ticks[rw], duration);
-	part_stat_add(cpu, part, sectors[rw], n_sect);
-	part_stat_add(cpu, part, io_ticks, duration);
+	for (i = 0; ; ++i) {
+		if (i == MAXIOC)
+			return 1;
+		if (list_empty(&iocq.head))
+			return 0;
+		pos = iocq.head.next;
+		list_del(pos);
+		spin_unlock_irq(&iocq.lock);
+		f = list_entry(pos, struct frame, head);
+		ktiocomplete(f);
+		spin_lock_irq(&iocq.lock);
+	}
+}
 
-	part_stat_unlock();
+static int
+kthread(void *vp)
+{
+	struct ktstate *k;
+	DECLARE_WAITQUEUE(wait, current);
+	int more;
+
+	k = vp;
+	current->flags |= PF_NOFREEZE;
+	set_user_nice(current, -10);
+	complete(&k->rendez);	/* tell spawner we're running */
+	do {
+		spin_lock_irq(k->lock);
+		more = k->fn();
+		if (!more) {
+			add_wait_queue(k->waitq, &wait);
+			__set_current_state(TASK_INTERRUPTIBLE);
+		}
+		spin_unlock_irq(k->lock);
+		if (!more) {
+			schedule();
+			remove_wait_queue(k->waitq, &wait);
+		} else
+			cond_resched();
+	} while (!kthread_should_stop());
+	complete(&k->rendez);	/* tell spawner we're stopping */
+	return 0;
 }
 
 void
+aoe_ktstop(struct ktstate *k)
+{
+	kthread_stop(k->task);
+	wait_for_completion(&k->rendez);
+}
+
+int
+aoe_ktstart(struct ktstate *k)
+{
+	struct task_struct *task;
+
+	init_completion(&k->rendez);
+	task = kthread_run(kthread, k, k->name);
+	if (task == NULL || IS_ERR(task))
+		return -ENOMEM;
+	k->task = task;
+	wait_for_completion(&k->rendez); /* allow kthread to start */
+	init_completion(&k->rendez);	/* for waiting for exit later */
+	return 0;
+}
+
+/* pass it off to kthreads for processing */
+static void
+ktcomplete(struct frame *f, struct sk_buff *skb)
+{
+	ulong flags;
+
+	f->r_skb = skb;
+	spin_lock_irqsave(&iocq.lock, flags);
+	list_add_tail(&f->head, &iocq.head);
+	spin_unlock_irqrestore(&iocq.lock, flags);
+	wake_up(&ktiowq);
+}
+
+struct sk_buff *
 aoecmd_ata_rsp(struct sk_buff *skb)
 {
-	struct sk_buff_head queue;
 	struct aoedev *d;
-	struct aoe_hdr *hin, *hout;
-	struct aoe_atahdr *ahin, *ahout;
+	struct aoe_hdr *h;
 	struct frame *f;
-	struct buf *buf;
 	struct aoetgt *t;
-	struct aoeif *ifp;
-	register long n;
+	u32 n;
 	ulong flags;
 	char ebuf[128];
 	u16 aoemajor;
 
-	hin = (struct aoe_hdr *) skb_mac_header(skb);
-	aoemajor = get_unaligned_be16(&hin->major);
-	d = aoedev_by_aoeaddr(aoemajor, hin->minor);
+	h = (struct aoe_hdr *) skb->data;
+	aoemajor = be16_to_cpu(get_unaligned(&h->major));
+	d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
 	if (d == NULL) {
 		snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
 			"for unknown device %d.%d\n",
-			 aoemajor, hin->minor);
+			aoemajor, h->minor);
 		aoechr_error(ebuf);
-		return;
+		return skb;
 	}
 
 	spin_lock_irqsave(&d->lock, flags);
 
-	n = get_unaligned_be32(&hin->tag);
-	t = gettgt(d, hin->src);
-	if (t == NULL) {
-		printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
-			d->aoemajor, d->aoeminor, hin->src);
-		spin_unlock_irqrestore(&d->lock, flags);
-		return;
-	}
-	f = getframe(t, n);
+	n = be32_to_cpu(get_unaligned(&h->tag));
+	f = getframe(d, n);
 	if (f == NULL) {
 		calc_rttavg(d, -tsince(n));
 		spin_unlock_irqrestore(&d->lock, flags);
+		aoedev_put(d);
 		snprintf(ebuf, sizeof ebuf,
 			"%15s e%d.%d    tag=%08x@%08lx\n",
 			"unexpected rsp",
-			get_unaligned_be16(&hin->major),
-			hin->minor,
-			get_unaligned_be32(&hin->tag),
+			get_unaligned_be16(&h->major),
+			h->minor,
+			get_unaligned_be32(&h->tag),
 			jiffies);
 		aoechr_error(ebuf);
-		return;
+		return skb;
 	}
-
+	t = f->t;
 	calc_rttavg(d, tsince(f->tag));
-
-	ahin = (struct aoe_atahdr *) (hin+1);
-	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
-	ahout = (struct aoe_atahdr *) (hout+1);
-	buf = f->buf;
-
-	if (ahin->cmdstat & 0xa9) {	/* these bits cleared on success */
-		printk(KERN_ERR
-			"aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
-			ahout->cmdstat, ahin->cmdstat,
-			d->aoemajor, d->aoeminor);
-		if (buf)
-			buf->flags |= BUFFL_FAIL;
-	} else {
-		if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
-			d->htgt = NULL;
-		n = ahout->scnt << 9;
-		switch (ahout->cmdstat) {
-		case ATA_CMD_PIO_READ:
-		case ATA_CMD_PIO_READ_EXT:
-			if (skb->len - sizeof *hin - sizeof *ahin < n) {
-				printk(KERN_ERR
-					"aoe: %s.  skb->len=%d need=%ld\n",
-					"runt data size in read", skb->len, n);
-				/* fail frame f?  just returning will rexmit. */
-				spin_unlock_irqrestore(&d->lock, flags);
-				return;
-			}
-			memcpy(f->bufaddr, ahin+1, n);
-		case ATA_CMD_PIO_WRITE:
-		case ATA_CMD_PIO_WRITE_EXT:
-			ifp = getif(t, skb->dev);
-			if (ifp) {
-				ifp->lost = 0;
-				if (n > DEFAULTBCNT)
-					ifp->lostjumbo = 0;
-			}
-			if (f->bcnt -= n) {
-				f->lba += n >> 9;
-				f->bufaddr += n;
-				resend(d, t, f);
-				goto xmit;
-			}
-			break;
-		case ATA_CMD_ID_ATA:
-			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
-				printk(KERN_INFO
-					"aoe: runt data size in ataid.  skb->len=%d\n",
-					skb->len);
-				spin_unlock_irqrestore(&d->lock, flags);
-				return;
-			}
-			ataid_complete(d, t, (char *) (ahin+1));
-			break;
-		default:
-			printk(KERN_INFO
-				"aoe: unrecognized ata command %2.2Xh for %d.%d\n",
-				ahout->cmdstat,
-				get_unaligned_be16(&hin->major),
-				hin->minor);
-		}
-	}
-
-	if (buf && --buf->nframesout == 0 && buf->resid == 0) {
-		diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
-		if (buf->flags & BUFFL_FAIL)
-			bio_endio(buf->bio, -EIO);
-		else {
-			bio_flush_dcache_pages(buf->bio);
-			bio_endio(buf->bio, 0);
-		}
-		mempool_free(buf, d->bufpool);
-	}
-
-	f->buf = NULL;
-	f->tag = FREETAG;
 	t->nout--;
-
 	aoecmd_work(d);
-xmit:
-	__skb_queue_head_init(&queue);
-	skb_queue_splice_init(&d->sendq, &queue);
 
 	spin_unlock_irqrestore(&d->lock, flags);
-	aoenet_xmit(&queue);
+
+	ktcomplete(f, skb);
+
+	/*
+	 * Note here that we do not perform an aoedev_put, as we are
+	 * leaving this reference for the ktio to release.
+	 */
+	return NULL;
 }
 
 void
@@ -896,7 +1211,7 @@ aoecmd_ata_id(struct aoedev *d)
 	struct sk_buff *skb;
 	struct aoetgt *t;
 
-	f = freeframe(d);
+	f = newframe(d);
 	if (f == NULL)
 		return NULL;
 
@@ -909,6 +1224,7 @@ aoecmd_ata_id(struct aoedev *d)
 	skb_put(skb, sizeof *h + sizeof *ah);
 	memset(h, 0, skb->len);
 	f->tag = aoehdr_atainit(d, t, h);
+	fhash(f);
 	t->nout++;
 	f->waited = 0;
 
@@ -929,7 +1245,6 @@ static struct aoetgt *
 addtgt(struct aoedev *d, char *addr, ulong nframes)
 {
 	struct aoetgt *t, **tt, **te;
-	struct frame *f, *e;
 
 	tt = d->targets;
 	te = tt + NTARGETS;
@@ -941,26 +1256,73 @@ addtgt(struct aoedev *d, char *addr, ulong nframes)
 			"aoe: device addtgt failure; too many targets\n");
 		return NULL;
 	}
-	t = kcalloc(1, sizeof *t, GFP_ATOMIC);
-	f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
-	if (!t || !f) {
-		kfree(f);
-		kfree(t);
+	t = kzalloc(sizeof(*t), GFP_ATOMIC);
+	if (!t) {
 		printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
 		return NULL;
 	}
 
+	d->ntargets++;
 	t->nframes = nframes;
-	t->frames = f;
-	e = f + nframes;
-	for (; f < e; f++)
-		f->tag = FREETAG;
+	t->d = d;
 	memcpy(t->addr, addr, sizeof t->addr);
 	t->ifp = t->ifs;
 	t->maxout = t->nframes;
+	INIT_LIST_HEAD(&t->ffree);
 	return *tt = t;
 }
 
+static void
+setdbcnt(struct aoedev *d)
+{
+	struct aoetgt **t, **e;
+	int bcnt = 0;
+
+	t = d->targets;
+	e = t + NTARGETS;
+	for (; t < e && *t; t++)
+		if (bcnt == 0 || bcnt > (*t)->minbcnt)
+			bcnt = (*t)->minbcnt;
+	if (bcnt != d->maxbcnt) {
+		d->maxbcnt = bcnt;
+		pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
+			d->aoemajor, d->aoeminor, bcnt);
+	}
+}
+
+static void
+setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
+{
+	struct aoedev *d;
+	struct aoeif *p, *e;
+	int minbcnt;
+
+	d = t->d;
+	minbcnt = bcnt;
+	p = t->ifs;
+	e = p + NAOEIFS;
+	for (; p < e; p++) {
+		if (p->nd == NULL)
+			break;		/* end of the valid interfaces */
+		if (p->nd == nd) {
+			p->bcnt = bcnt;	/* we're updating */
+			nd = NULL;
+		} else if (minbcnt > p->bcnt)
+			minbcnt = p->bcnt; /* find the min interface */
+	}
+	if (nd) {
+		if (p == e) {
+			pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
+			return;
+		}
+		dev_hold(nd);
+		p->nd = nd;
+		p->bcnt = bcnt;
+	}
+	t->minbcnt = minbcnt;
+	setdbcnt(d);
+}
+
 void
 aoecmd_cfg_rsp(struct sk_buff *skb)
 {
@@ -968,11 +1330,12 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 	struct aoe_hdr *h;
 	struct aoe_cfghdr *ch;
 	struct aoetgt *t;
-	struct aoeif *ifp;
-	ulong flags, sysminor, aoemajor;
+	ulong flags, aoemajor;
 	struct sk_buff *sl;
+	struct sk_buff_head queue;
 	u16 n;
 
+	sl = NULL;
 	h = (struct aoe_hdr *) skb_mac_header(skb);
 	ch = (struct aoe_cfghdr *) (h+1);
 
@@ -986,10 +1349,13 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 			"Check shelf dip switches.\n");
 		return;
 	}
-
-	sysminor = SYSMINOR(aoemajor, h->minor);
-	if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
-		printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n",
+	if (aoemajor == 0xffff) {
+		pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
+			aoemajor, (int) h->minor);
+		return;
+	}
+	if (h->minor == 0xff) {
+		pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
 			aoemajor, (int) h->minor);
 		return;
 	}
@@ -998,9 +1364,9 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 	if (n > aoe_maxout)	/* keep it reasonable */
 		n = aoe_maxout;
 
-	d = aoedev_by_sysminor_m(sysminor);
+	d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
 	if (d == NULL) {
-		printk(KERN_INFO "aoe: device sysminor_m failure\n");
+		pr_info("aoe: device allocation failure\n");
 		return;
 	}
 
@@ -1009,52 +1375,26 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 	t = gettgt(d, h->src);
 	if (!t) {
 		t = addtgt(d, h->src, n);
-		if (!t) {
-			spin_unlock_irqrestore(&d->lock, flags);
-			return;
-		}
-	}
-	ifp = getif(t, skb->dev);
-	if (!ifp) {
-		ifp = addif(t, skb->dev);
-		if (!ifp) {
-			printk(KERN_INFO
-				"aoe: device addif failure; "
-				"too many interfaces?\n");
-			spin_unlock_irqrestore(&d->lock, flags);
-			return;
-		}
-	}
-	if (ifp->maxbcnt) {
-		n = ifp->nd->mtu;
-		n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
-		n /= 512;
-		if (n > ch->scnt)
-			n = ch->scnt;
-		n = n ? n * 512 : DEFAULTBCNT;
-		if (n != ifp->maxbcnt) {
-			printk(KERN_INFO
-				"aoe: e%ld.%d: setting %d%s%s:%pm\n",
-				d->aoemajor, d->aoeminor, n,
-				" byte data frames on ", ifp->nd->name,
-				t->addr);
-			ifp->maxbcnt = n;
-		}
+		if (!t)
+			goto bail;
 	}
+	n = skb->dev->mtu;
+	n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
+	n /= 512;
+	if (n > ch->scnt)
+		n = ch->scnt;
+	n = n ? n * 512 : DEFAULTBCNT;
+	setifbcnt(t, skb->dev, n);
 
 	/* don't change users' perspective */
-	if (d->nopen) {
-		spin_unlock_irqrestore(&d->lock, flags);
-		return;
+	if (d->nopen == 0) {
+		d->fw_ver = be16_to_cpu(ch->fwver);
+		sl = aoecmd_ata_id(d);
 	}
-	d->fw_ver = be16_to_cpu(ch->fwver);
-
-	sl = aoecmd_ata_id(d);
-
+bail:
 	spin_unlock_irqrestore(&d->lock, flags);
-
+	aoedev_put(d);
 	if (sl) {
-		struct sk_buff_head queue;
 		__skb_queue_head_init(&queue);
 		__skb_queue_tail(&queue, sl);
 		aoenet_xmit(&queue);
@@ -1065,20 +1405,74 @@ void
 aoecmd_cleanslate(struct aoedev *d)
 {
 	struct aoetgt **t, **te;
-	struct aoeif *p, *e;
 
 	d->mintimer = MINTIMER;
+	d->maxbcnt = 0;
 
 	t = d->targets;
 	te = t + NTARGETS;
-	for (; t < te && *t; t++) {
+	for (; t < te && *t; t++)
 		(*t)->maxout = (*t)->nframes;
-		p = (*t)->ifs;
-		e = p + NAOEIFS;
-		for (; p < e; p++) {
-			p->lostjumbo = 0;
-			p->lost = 0;
-			p->maxbcnt = DEFAULTBCNT;
+}
+
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf)
+{
+	if (buf == NULL)
+		return;
+	buf->resid = 0;
+	clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+	if (buf->nframesout == 0)
+		aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void)
+{
+	struct frame *f;
+	struct aoedev *d;
+	LIST_HEAD(flist);
+	struct list_head *pos;
+	struct sk_buff *skb;
+	ulong flags;
+
+	spin_lock_irqsave(&iocq.lock, flags);
+	list_splice_init(&iocq.head, &flist);
+	spin_unlock_irqrestore(&iocq.lock, flags);
+	while (!list_empty(&flist)) {
+		pos = flist.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+		d = f->t->d;
+		skb = f->r_skb;
+		spin_lock_irqsave(&d->lock, flags);
+		if (f->buf) {
+			f->buf->nframesout--;
+			aoe_failbuf(d, f->buf);
 		}
+		aoe_freetframe(f);
+		spin_unlock_irqrestore(&d->lock, flags);
+		dev_kfree_skb(skb);
+		aoedev_put(d);
 	}
 }
+
+int __init
+aoecmd_init(void)
+{
+	INIT_LIST_HEAD(&iocq.head);
+	spin_lock_init(&iocq.lock);
+	init_waitqueue_head(&ktiowq);
+	kts.name = "aoe_ktio";
+	kts.fn = ktio;
+	kts.waitq = &ktiowq;
+	kts.lock = &iocq.lock;
+	return aoe_ktstart(&kts);
+}
+
+void
+aoecmd_exit(void)
+{
+	aoe_ktstop(&kts);
+	aoe_flush_iocq();
+}