From 289d1054e972b445fe8f7bbcbebf40b1bec37384 Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Thu, 7 Mar 2013 08:53:00 +1100
Subject: lguest: fix paths in comments

Commit 07fe997 moved the lguest tool from
Documentation/virtual/lguest/ to tools/lguest/; update the paths to match.

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/Kconfig | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
 	---help---
 	  This is a very simple module which allows you to run
 	  multiple instances of the same Linux kernel, using the
-	  "lguest" command found in the Documentation/virtual/lguest
-	  directory.
+	  "lguest" command found in the tools/lguest directory.
 
 	  Note that "lguest" is pronounced to rhyme with "fell quest",
-	  not "rustyvisor". See Documentation/virtual/lguest/lguest.txt.
+	  not "rustyvisor". See tools/lguest/lguest.txt.
 
 	  If unsure, say N.  If curious, say M.  If masochistic, say Y.
-- 
cgit v1.2.3


From ba06d1e1d3350a38476ea6b7655ba7c047baad67 Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Tue, 12 Mar 2013 15:34:40 +1030
Subject: virtio-scsi: use pr_err() instead of printk()

Convert the virtio-scsi driver to use pr_err() instead of printk().

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/scsi/virtio_scsi.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 3449a1f8c656..0f5dd2804ae5 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -13,6 +13,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mempool.h>
@@ -794,8 +796,7 @@ static int __init init(void)
 
 	virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
 	if (!virtscsi_cmd_cache) {
-		printk(KERN_ERR "kmem_cache_create() for "
-				"virtscsi_cmd_cache failed\n");
+		pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
 		goto error;
 	}
 
@@ -804,8 +805,7 @@ static int __init init(void)
 		mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
 					 virtscsi_cmd_cache);
 	if (!virtscsi_cmd_pool) {
-		printk(KERN_ERR "mempool_create() for"
-				"virtscsi_cmd_pool failed\n");
+		pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
 		goto error;
 	}
 	ret = register_virtio_driver(&virtio_scsi_driver);
-- 
cgit v1.2.3


From 9d9598b81c5c05495009e81ac0508ec8d1558015 Mon Sep 17 00:00:00 2001
From: Milos Vyletel <milos.vyletel@sde.cz>
Date: Tue, 12 Mar 2013 15:34:40 +1030
Subject: virtio-blk: emit udev event when device is resized

When a virtio-blk device is resized from the host (using block_resize from
QEMU), emit a KOBJ_CHANGE uevent to notify the guest about the change. This
allows users to write custom udev rules that take whatever action is needed
when such an event occurs. As a proof of concept I've created a simple udev
rule that automatically resizes the filesystem on a virtio-blk device.

ACTION=="change", KERNEL=="vd*", \
        ENV{RESIZE}=="1", \
        ENV{ID_FS_TYPE}=="ext[3-4]", \
        RUN+="/sbin/resize2fs /dev/%k"
ACTION=="change", KERNEL=="vd*", \
        ENV{RESIZE}=="1", \
        ENV{ID_FS_TYPE}=="LVM2_member", \
        RUN+="/sbin/pvresize /dev/%k"

Signed-off-by: Milos Vyletel <milos.vyletel@sde.cz>
Tested-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (minor simplification)
---
 drivers/block/virtio_blk.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a25bc0d..922bcb97e23a 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -539,6 +539,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
 	struct virtio_device *vdev = vblk->vdev;
 	struct request_queue *q = vblk->disk->queue;
 	char cap_str_2[10], cap_str_10[10];
+	char *envp[] = { "RESIZE=1", NULL };
 	u64 capacity, size;
 
 	mutex_lock(&vblk->config_lock);
@@ -568,6 +569,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
 
 	set_capacity(vblk->disk, capacity);
 	revalidate_disk(vblk->disk);
+	kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
 done:
 	mutex_unlock(&vblk->config_lock);
 }
-- 
cgit v1.2.3


From 73640c991e2f2804939af70567b23e4c54b7c266 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 18 Mar 2013 13:22:18 +1030
Subject: tools/virtio: fix build for 3.8

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/vhost/test.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 91d6f060aade..329d3021d059 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -275,7 +275,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
 		return vhost_test_reset_owner(n);
 	default:
 		mutex_lock(&n->dev.mutex);
-		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
+                if (r == -ENOIOCTLCMD)
+                        r = vhost_vring_ioctl(&n->dev, ioctl, argp);
 		vhost_test_flush(n);
 		mutex_unlock(&n->dev.mutex);
 		return r;
-- 
cgit v1.2.3


From a9a0fef779074838230e04a322fd2bdc921f4f4f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 18 Mar 2013 13:22:19 +1030
Subject: virtio_ring: expose virtio barriers for use in vringh.

The host side of ring needs this logic too.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c | 33 ++++++---------------------------
 1 file changed, 6 insertions(+), 27 deletions(-)

(limited to 'drivers')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index ffd7e7da5d3b..245177c286ae 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,27 +24,6 @@
 #include <linux/module.h>
 #include <linux/hrtimer.h>
 
-/* virtio guest is communicating with a virtual "device" that actually runs on
- * a host processor.  Memory barriers are used to control SMP effects. */
-#ifdef CONFIG_SMP
-/* Where possible, use SMP barriers which are more lightweight than mandatory
- * barriers, because mandatory barriers control MMIO effects on accesses
- * through relaxed memory I/O windows (which virtio-pci does not use). */
-#define virtio_mb(vq) \
-	do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
-#define virtio_rmb(vq) \
-	do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
-#define virtio_wmb(vq) \
-	do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
-#else
-/* We must force memory ordering even if guest is UP since host could be
- * running on another CPU, but SMP barriers are defined to barrier() in that
- * configuration. So fall back to mandatory barriers instead. */
-#define virtio_mb(vq) mb()
-#define virtio_rmb(vq) rmb()
-#define virtio_wmb(vq) wmb()
-#endif
-
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
 #define BAD_RING(_vq, fmt, args...)				\
@@ -276,7 +255,7 @@ add_head:
 
 	/* Descriptors and available array need to be set before we expose the
 	 * new available array entries. */
-	virtio_wmb(vq);
+	virtio_wmb(vq->weak_barriers);
 	vq->vring.avail->idx++;
 	vq->num_added++;
 
@@ -312,7 +291,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
 	START_USE(vq);
 	/* We need to expose available array entries before checking avail
 	 * event. */
-	virtio_mb(vq);
+	virtio_mb(vq->weak_barriers);
 
 	old = vq->vring.avail->idx - vq->num_added;
 	new = vq->vring.avail->idx;
@@ -436,7 +415,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 	}
 
 	/* Only get used array entries after they have been exposed by host. */
-	virtio_rmb(vq);
+	virtio_rmb(vq->weak_barriers);
 
 	last_used = (vq->last_used_idx & (vq->vring.num - 1));
 	i = vq->vring.used->ring[last_used].id;
@@ -460,7 +439,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 	 * the read in the next get_buf call. */
 	if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
 		vring_used_event(&vq->vring) = vq->last_used_idx;
-		virtio_mb(vq);
+		virtio_mb(vq->weak_barriers);
 	}
 
 #ifdef DEBUG
@@ -513,7 +492,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
 	 * entry. Always do both to keep code simple. */
 	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
 	vring_used_event(&vq->vring) = vq->last_used_idx;
-	virtio_mb(vq);
+	virtio_mb(vq->weak_barriers);
 	if (unlikely(more_used(vq))) {
 		END_USE(vq);
 		return false;
@@ -553,7 +532,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 	/* TODO: tune this threshold */
 	bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
 	vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
-	virtio_mb(vq);
+	virtio_mb(vq->weak_barriers);
 	if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
 		END_USE(vq);
 		return false;
-- 
cgit v1.2.3


From f87d0fbb579818fed3eeb0923cc253163ab93039 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 13:50:14 +1030
Subject: vringh: host-side implementation of virtio rings.

Getting use of virtio rings correct is tricky, and a recent patch saw
an implementation of in-kernel rings (as separate from userspace).

This abstracts the business of dealing with the virtio ring layout
from the access (userspace or direct); to do this, we use function
pointers, which gcc inlines correctly.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/Makefile          |    2 +-
 drivers/vhost/Kconfig     |    8 +
 drivers/vhost/Kconfig.tcm |    1 +
 drivers/vhost/Makefile    |    2 +
 drivers/vhost/vringh.c    | 1007 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 1019 insertions(+), 1 deletion(-)
 create mode 100644 drivers/vhost/vringh.c

(limited to 'drivers')

diff --git a/drivers/Makefile b/drivers/Makefile
index dce39a95fa71..72d28d34ee24 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -123,7 +123,7 @@ obj-$(CONFIG_PPC_PS3)		+= ps3/
 obj-$(CONFIG_OF)		+= of/
 obj-$(CONFIG_SSB)		+= ssb/
 obj-$(CONFIG_BCMA)		+= bcma/
-obj-$(CONFIG_VHOST_NET)		+= vhost/
+obj-$(CONFIG_VHOST_RING)	+= vhost/
 obj-$(CONFIG_VLYNQ)		+= vlynq/
 obj-$(CONFIG_STAGING)		+= staging/
 obj-y				+= platform/
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index bf243177ffe1..85b773a93a5d 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
 config VHOST_NET
 	tristate "Host kernel accelerator for virtio net"
 	depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
+	select VHOST_RING
 	---help---
 	  This kernel module can be loaded in host kernel to accelerate
 	  guest networking with virtio_net. Not to be confused with virtio_net
@@ -12,3 +13,10 @@ config VHOST_NET
 if STAGING
 source "drivers/vhost/Kconfig.tcm"
 endif
+
+config VHOST_RING
+	tristate
+	---help---
+	  This option is selected by any driver which needs to access
+	  the host side of a virtio ring.
+
diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm
index 7e3aa28d999e..c3a8cfa1de72 100644
--- a/drivers/vhost/Kconfig.tcm
+++ b/drivers/vhost/Kconfig.tcm
@@ -1,6 +1,7 @@
 config TCM_VHOST
 	tristate "TCM_VHOST fabric module"
 	depends on TARGET_CORE && EVENTFD && m
+	select VHOST_RING
 	default n
 	---help---
 	Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053bc9ab..1d37f5e12be6 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
 
 obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+
+obj-$(CONFIG_VHOST_RING) += vringh.o
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
new file mode 100644
index 000000000000..bff0775e258c
--- /dev/null
+++ b/drivers/vhost/vringh.c
@@ -0,0 +1,1007 @@
+/*
+ * Helpers for the host side of a virtio ring.
+ *
+ * Since these may be in userspace, we use (inline) accessors.
+ */
+#include <linux/vringh.h>
+#include <linux/virtio_ring.h>
+#include <linux/kernel.h>
+#include <linux/ratelimit.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+/*
+ * Log a complaint about the ring, rate-limited with the default interval
+ * and burst.
+ *
+ * Callers pass a bare message with no trailing newline, so the "vringh: "
+ * prefix, the formatted message and the terminating newline are emitted as
+ * a single record via %pV instead of as two separate printk() fragments.
+ */
+static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
+{
+	static DEFINE_RATELIMIT_STATE(vringh_rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct va_format vaf;
+	va_list ap;
+
+	if (!__ratelimit(&vringh_rs))
+		return;
+
+	va_start(ap, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &ap;
+	printk(KERN_NOTICE "vringh: %pV\n", &vaf);
+	va_end(ap);
+}
+
+/*
+ * Fetch the next head index from the avail ring.
+ *
+ * Returns the head (and advances *last_avail_idx) on success, vring->num if
+ * the ring is empty, or a -ve error if getu16 fails or the head read is out
+ * of range.
+ */
+static inline int __vringh_get_head(const struct vringh *vrh,
+				    int (*getu16)(u16 *val, const u16 *p),
+				    u16 *last_avail_idx)
+{
+	u16 published, slot, head;
+	int ret;
+
+	ret = getu16(&published, &vrh->vring.avail->idx);
+	if (ret) {
+		vringh_bad("Failed to access avail idx at %p",
+			   &vrh->vring.avail->idx);
+		return ret;
+	}
+
+	/* Nothing new since the index we last consumed. */
+	if (published == *last_avail_idx)
+		return vrh->vring.num;
+
+	/* Only get avail ring entries after they have been exposed by guest. */
+	virtio_rmb(vrh->weak_barriers);
+
+	slot = *last_avail_idx & (vrh->vring.num - 1);
+
+	ret = getu16(&head, &vrh->vring.avail->ring[slot]);
+	if (ret) {
+		vringh_bad("Failed to read head: idx %d address %p",
+			   *last_avail_idx, &vrh->vring.avail->ring[slot]);
+		return ret;
+	}
+
+	if (head >= vrh->vring.num) {
+		vringh_bad("Guest says index %u > %u is available",
+			   head, vrh->vring.num);
+		return -EINVAL;
+	}
+
+	*last_avail_idx += 1;
+	return head;
+}
+
+/*
+ * Copy up to len bytes between ptr and the iovec, starting at element
+ * iov->i.  xfer does each piecewise copy; a non-zero return from it stops
+ * the transfer and is handed back unchanged.  Otherwise returns the number
+ * of bytes copied.
+ */
+static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
+				      void *ptr, size_t len,
+				      int (*xfer)(void *addr, void *ptr,
+						  size_t len))
+{
+	int err, done = 0;
+
+	while (len && iov->i < iov->used) {
+		struct kvec *cur = &iov->iov[iov->i];
+		size_t chunk = min(cur->iov_len, len);
+
+		err = xfer(cur->iov_base, ptr, chunk);
+		if (err)
+			return err;
+
+		/* Advance both sides of the copy by what was moved. */
+		done += chunk;
+		ptr += chunk;
+		len -= chunk;
+		iov->consumed += chunk;
+		cur->iov_base += chunk;
+		cur->iov_len -= chunk;
+
+		if (cur->iov_len)
+			continue;
+
+		/* Element used up: put back its original extent, move on. */
+		cur->iov_base -= iov->consumed;
+		cur->iov_len = iov->consumed;
+		iov->consumed = 0;
+		iov->i++;
+	}
+	return done;
+}
+
+/*
+ * Check addr/len against the cached range, calling getrange to refresh the
+ * cache only when addr falls outside it.
+ *
+ * Returns false if getrange fails or addr + *len wraps beyond zero.
+ * May reduce *len if range is shorter.
+ */
+static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
+			       struct vringh_range *range,
+			       bool (*getrange)(struct vringh *,
+						u64, struct vringh_range *))
+{
+	u64 end;
+
+	if ((addr < range->start || addr > range->end_incl) &&
+	    !getrange(vrh, addr, range))
+		return false;
+	BUG_ON(addr < range->start || addr > range->end_incl);
+
+	/* One past the last byte; exactly zero means "up to end of memory". */
+	end = addr + *len;
+
+	if (unlikely(end == 0)) {
+		/* Fine as-is only if the range also runs to end of memory. */
+		if (range->end_incl != -1ULL)
+			*len = range->end_incl + 1 - addr;
+		return true;
+	}
+
+	/* Otherwise, don't wrap. */
+	if (end < addr) {
+		vringh_bad("Wrapping descriptor %zu@0x%llx",
+			   *len, (unsigned long long)addr);
+		return false;
+	}
+
+	/* Clamp to what remains of the range. */
+	if (unlikely(end - 1 > range->end_incl))
+		*len = range->end_incl + 1 - addr;
+	return true;
+}
+
+/*
+ * Drop-in replacement for range_check() that accepts every address: same
+ * signature, never touches *len or range, never calls getrange.
+ */
+static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
+				  struct vringh_range *range,
+				  bool (*getrange)(struct vringh *,
+						   u64, struct vringh_range *))
+{
+	const bool always_ok = true;
+
+	return always_ok;
+}
+
+/*
+ * Switch traversal from the current table to the indirect table at addr
+ * described by desc.  *up_next remembers where to resume afterwards: the
+ * descriptor's next index, or -2 if it has none.
+ *
+ * No reason for this code to be inline.
+ */
+static int move_to_indirect(int *up_next, u16 *i, void *addr,
+			    const struct vring_desc *desc,
+			    struct vring_desc **descs, int *desc_max)
+{
+	const size_t entry_size = sizeof(struct vring_desc);
+
+	/* Indirect tables can't have indirect. */
+	if (*up_next != -1) {
+		vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
+		return -EINVAL;
+	}
+
+	/* The table must be a whole number of descriptors. */
+	if (unlikely(desc->len % entry_size)) {
+		vringh_bad("Strange indirect len %u", desc->len);
+		return -EINVAL;
+	}
+
+	/* We will check this when we follow it! */
+	*up_next = (desc->flags & VRING_DESC_F_NEXT) ? desc->next : -2;
+
+	*descs = addr;
+	*desc_max = desc->len / entry_size;
+
+	/* Now, start at the first indirect. */
+	*i = 0;
+	return 0;
+}
+
+/*
+ * Double the capacity of iov's element array (at least 8 entries).
+ *
+ * max_num carries the capacity with VRINGH_IOV_ALLOCATED or'ed in.  When the
+ * flag is set the array is krealloc()ed; when it is clear the existing
+ * entries are copied into a fresh kmalloc() buffer and the flag is set.
+ * Element sizes are taken from the array's own type (struct kvec) rather
+ * than from struct iovec.
+ *
+ * Returns 0, or -ENOMEM with iov left unchanged.
+ */
+static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
+{
+	struct kvec *new;
+	unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;
+
+	if (new_num < 8)
+		new_num = 8;
+
+	flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
+	if (flag)
+		new = krealloc(iov->iov, new_num * sizeof(*new), gfp);
+	else {
+		new = kmalloc(new_num * sizeof(*new), gfp);
+		if (new) {
+			/* Flag is clear here, so max_num is the plain count. */
+			memcpy(new, iov->iov, iov->max_num * sizeof(*new));
+			flag = VRINGH_IOV_ALLOCATED;
+		}
+	}
+	if (!new)
+		return -ENOMEM;
+	iov->iov = new;
+	iov->max_num = (new_num | flag);
+	return 0;
+}
+
+/*
+ * Leave an indirect table: point traversal back at the ring's own
+ * descriptor table, mark *up_next as unused (-1) and return the index that
+ * had been saved in it.
+ */
+static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
+				       struct vring_desc **descs, int *desc_max)
+{
+	u16 resume_at = *up_next;
+
+	*descs = vrh->vring.desc;
+	*desc_max = vrh->vring.num;
+	*up_next = -1;
+	return resume_at;
+}
+
+/*
+ * Copy one struct vring_desc from src to dst in pieces.  Each piece is
+ * passed through rcheck first, so a copy never exceeds the length rcheck
+ * leaves in place.  Returns 0, -EINVAL if rcheck rejects an address, or
+ * whatever non-zero value copy returns.
+ */
+static int slow_copy(struct vringh *vrh, void *dst, const void *src,
+		     bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
+				    struct vringh_range *range,
+				    bool (*getrange)(struct vringh *vrh,
+						     u64,
+						     struct vringh_range *)),
+		     bool (*getrange)(struct vringh *vrh,
+				      u64 addr,
+				      struct vringh_range *r),
+		     struct vringh_range *range,
+		     int (*copy)(void *dst, const void *src, size_t len))
+{
+	size_t remaining = sizeof(struct vring_desc);
+
+	while (remaining) {
+		size_t chunk = remaining;
+		/* Undo the range offset to get the address rcheck expects. */
+		u64 chk_addr = (u64)(unsigned long)src - range->offset;
+		int err;
+
+		if (!rcheck(vrh, chk_addr, &chunk, range, getrange))
+			return -EINVAL;
+
+		err = copy(dst, src, chunk);
+		if (err)
+			return err;
+
+		dst += chunk;
+		src += chunk;
+		remaining -= chunk;
+	}
+	return 0;
+}
+
+/*
+ * Walk the descriptor chain starting at descriptor i, appending each buffer
+ * to riov (readable) or wiov (writable) and growing them with gfp as needed.
+ *
+ * @rcheck/@getrange: validate each address and fill in the range whose
+ *	offset is added to it; a descriptor that rcheck shortens is split
+ *	over several iov elements.
+ * @copy: fetches a descriptor from the table.
+ *
+ * One level of indirect descriptors is followed.  At least one of riov and
+ * wiov must be non-NULL; both are reset to empty before the walk.
+ *
+ * Returns 0 or a -ve error; the iovs may hold partial results on error.
+ */
+static inline int
+__vringh_iov(struct vringh *vrh, u16 i,
+	     struct vringh_kiov *riov,
+	     struct vringh_kiov *wiov,
+	     bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
+			    struct vringh_range *range,
+			    bool (*getrange)(struct vringh *, u64,
+					     struct vringh_range *)),
+	     bool (*getrange)(struct vringh *, u64, struct vringh_range *),
+	     gfp_t gfp,
+	     int (*copy)(void *dst, const void *src, size_t len))
+{
+	int err, count = 0, up_next, desc_max;
+	struct vring_desc desc, *descs;
+	struct vringh_range range = { -1ULL, 0 }, slowrange;
+	bool slow = false;
+
+	/* You must want something! */
+	BUG_ON(!riov && !wiov);
+
+	/* We start traversing vring's descriptor table. */
+	descs = vrh->vring.desc;
+	desc_max = vrh->vring.num;
+	up_next = -1;
+
+	/* Reset both iovs: a caller may pass riov and wiov together. */
+	if (riov)
+		riov->i = riov->used = 0;
+	if (wiov)
+		wiov->i = wiov->used = 0;
+
+	for (;;) {
+		void *addr;
+		struct vringh_kiov *iov;
+		size_t len;
+
+		if (unlikely(slow))
+			err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
+					&slowrange, copy);
+		else
+			err = copy(&desc, &descs[i], sizeof(desc));
+		if (unlikely(err))
+			goto fail;
+
+		if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+			/* Make sure it's OK, and get offset. */
+			len = desc.len;
+			if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
+				err = -EINVAL;
+				goto fail;
+			}
+
+			if (unlikely(len != desc.len)) {
+				slow = true;
+				/* We need to save this range to use offset */
+				slowrange = range;
+			}
+
+			addr = (void *)(unsigned long)(desc.addr + range.offset);
+			err = move_to_indirect(&up_next, &i, addr, &desc,
+					       &descs, &desc_max);
+			if (err)
+				goto fail;
+			continue;
+		}
+
+		if (count++ == vrh->vring.num) {
+			vringh_bad("Descriptor loop in %p", descs);
+			err = -ELOOP;
+			goto fail;
+		}
+
+		if (desc.flags & VRING_DESC_F_WRITE)
+			iov = wiov;
+		else {
+			iov = riov;
+			/*
+			 * ->used counts the writable buffers collected so
+			 * far; ->i is never advanced in this function.
+			 */
+			if (unlikely(wiov && wiov->used)) {
+				vringh_bad("Readable desc %p after writable",
+					   &descs[i]);
+				err = -EINVAL;
+				goto fail;
+			}
+		}
+
+		if (!iov) {
+			vringh_bad("Unexpected %s desc",
+				   !wiov ? "writable" : "readable");
+			err = -EPROTO;
+			goto fail;
+		}
+
+	again:
+		/* Make sure it's OK, and get offset. */
+		len = desc.len;
+		if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
+			err = -EINVAL;
+			goto fail;
+		}
+		addr = (void *)(unsigned long)(desc.addr + range.offset);
+
+		if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
+			err = resize_iovec(iov, gfp);
+			if (err)
+				goto fail;
+		}
+
+		iov->iov[iov->used].iov_base = addr;
+		iov->iov[iov->used].iov_len = len;
+		iov->used++;
+
+		/* rcheck shortened it: record the remainder as a new element. */
+		if (unlikely(len != desc.len)) {
+			desc.len -= len;
+			desc.addr += len;
+			goto again;
+		}
+
+		if (desc.flags & VRING_DESC_F_NEXT) {
+			i = desc.next;
+		} else {
+			/*
+			 * Just in case we need to finish traversing above.
+			 * up_next is -1 (not in an indirect table), -2 (the
+			 * indirect descriptor had no next) or the index to
+			 * resume at, and index 0 is a valid one.
+			 */
+			if (unlikely(up_next >= 0)) {
+				i = return_from_indirect(vrh, &up_next,
+							 &descs, &desc_max);
+				slow = false;
+			} else
+				break;
+		}
+
+		if (i >= desc_max) {
+			vringh_bad("Chained index %u > %u", i, desc_max);
+			err = -EINVAL;
+			goto fail;
+		}
+	}
+
+	return 0;
+
+fail:
+	return err;
+}
+
+/*
+ * Write num_used entries into the used ring after those already completed,
+ * then update the used index and add num_used to vrh->completed.
+ * Returns 0, or the error from putused/putu16.
+ */
+static inline int __vringh_complete(struct vringh *vrh,
+				    const struct vring_used_elem *used,
+				    unsigned int num_used,
+				    int (*putu16)(u16 *p, u16 val),
+				    int (*putused)(struct vring_used_elem *dst,
+						   const struct vring_used_elem
+						   *src, unsigned num))
+{
+	struct vring_used *ring = vrh->vring.used;
+	u16 idx = vrh->last_used_idx + vrh->completed;
+	u16 off = idx % vrh->vring.num;
+	int err;
+
+	/* Compiler knows num_used == 1 sometimes, hence extra check */
+	if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
+		/* Reaches the end of the ring: write tail part, then head. */
+		u16 first = vrh->vring.num - off;
+
+		err = putused(&ring->ring[off], used, first);
+		if (!err)
+			err = putused(&ring->ring[0], used + first,
+				      num_used - first);
+	} else {
+		err = putused(&ring->ring[off], used, num_used);
+	}
+
+	if (err) {
+		vringh_bad("Failed to write %u used entries %u at %p",
+			   num_used, off, &ring->ring[off]);
+		return err;
+	}
+
+	/* Make sure buffer is written before we update index. */
+	virtio_wmb(vrh->weak_barriers);
+
+	err = putu16(&vrh->vring.used->idx, idx + num_used);
+	if (err) {
+		vringh_bad("Failed to update used index at %p",
+			   &vrh->vring.used->idx);
+		return err;
+	}
+
+	vrh->completed += num_used;
+	return 0;
+}
+
+
+/*
+ * Work out whether the other side wants a notification.
+ *
+ * Returns a -ve error if getu16 fails, otherwise non-zero when a
+ * notification is needed.  With event indices, vrh->completed is folded
+ * into vrh->last_used_idx and cleared.
+ */
+static inline int __vringh_need_notify(struct vringh *vrh,
+				       int (*getu16)(u16 *val, const u16 *p))
+{
+	u16 used_event, flags;
+	bool notify;
+	int err;
+
+	/* Flush out used index update. This is paired with the
+	 * barrier that the Guest executes when enabling
+	 * interrupts. */
+	virtio_mb(vrh->weak_barriers);
+
+	/* Old-style, without event indices. */
+	if (!vrh->event_indices) {
+		err = getu16(&flags, &vrh->vring.avail->flags);
+		if (!err)
+			return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
+
+		vringh_bad("Failed to get flags at %p",
+			   &vrh->vring.avail->flags);
+		return err;
+	}
+
+	/* Modern: we know when other side wants to know. */
+	err = getu16(&used_event, &vring_used_event(&vrh->vring));
+	if (err) {
+		vringh_bad("Failed to get used event idx at %p",
+			   &vring_used_event(&vrh->vring));
+		return err;
+	}
+
+	/* Just in case we added so many that we wrap. */
+	notify = unlikely(vrh->completed > 0xffff) ||
+		 vring_need_event(used_event,
+				  vrh->last_used_idx + vrh->completed,
+				  vrh->last_used_idx);
+
+	vrh->last_used_idx += vrh->completed;
+	vrh->completed = 0;
+	return notify;
+}
+
+/*
+ * Ask to be notified, then re-check the avail index.  Returns false if the
+ * avail index has moved past last_avail_idx, true otherwise (including
+ * when a ring access fails).
+ */
+static inline bool __vringh_notify_enable(struct vringh *vrh,
+					  int (*getu16)(u16 *val, const u16 *p),
+					  int (*putu16)(u16 *p, u16 val))
+{
+	u16 avail_now;
+
+	if (vrh->event_indices) {
+		if (putu16(&vring_avail_event(&vrh->vring),
+			   vrh->last_avail_idx) != 0) {
+			vringh_bad("Updating avail event index %p",
+				   &vring_avail_event(&vrh->vring));
+			return true;
+		}
+	} else if (putu16(&vrh->vring.used->flags, 0) != 0) {
+		/* Old-school; update flags. */
+		vringh_bad("Clearing used flags %p",
+			   &vrh->vring.used->flags);
+		return true;
+	}
+
+	/* They could have slipped one in as we were doing that: make
+	 * sure it's written, then check again. */
+	virtio_mb(vrh->weak_barriers);
+
+	if (getu16(&avail_now, &vrh->vring.avail->idx) != 0) {
+		vringh_bad("Failed to check avail idx at %p",
+			   &vrh->vring.avail->idx);
+		return true;
+	}
+
+	/* This is unlikely, so we just leave notifications enabled
+	 * (if we're using event_indices, we'll only get one
+	 * notification anyway). */
+	return avail_now == vrh->last_avail_idx;
+}
+
+/*
+ * Ask not to be notified.  Only the flag-based scheme needs a write; with
+ * event indices this does nothing.  A failed write is logged, not returned.
+ */
+static inline void __vringh_notify_disable(struct vringh *vrh,
+					   int (*putu16)(u16 *p, u16 val))
+{
+	if (vrh->event_indices)
+		return;
+
+	/* Old-school; update flags. */
+	if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY))
+		vringh_bad("Setting used flags %p", &vrh->vring.used->flags);
+}
+
+/* Userspace access helpers: in this case, addresses are really userspace. */
+
+/* Read the u16 at userspace address p into *val; returns get_user()'s result. */
+static inline int getu16_user(u16 *val, const u16 *p)
+{
+	u16 __user *uptr = (__force u16 __user *)p;
+
+	return get_user(*val, uptr);
+}
+
+/* Store val at userspace address p; returns put_user()'s result. */
+static inline int putu16_user(u16 *p, u16 val)
+{
+	u16 __user *uptr = (__force u16 __user *)p;
+
+	return put_user(val, uptr);
+}
+
+/* Copy len bytes from userspace src to dst; 0 on success, else -EFAULT. */
+static inline int copydesc_user(void *dst, const void *src, size_t len)
+{
+	if (copy_from_user(dst, (__force void __user *)src, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* Copy num used elements from src to userspace dst; 0 or -EFAULT. */
+static inline int putused_user(struct vring_used_elem *dst,
+			       const struct vring_used_elem *src,
+			       unsigned int num)
+{
+	const size_t bytes = sizeof(*dst) * num;
+
+	if (copy_to_user((__force void __user *)dst, src, bytes))
+		return -EFAULT;
+	return 0;
+}
+
+/* vringh_iov_xfer() callback: userspace src -> dst; 0 or -EFAULT. */
+static inline int xfer_from_user(void *src, void *dst, size_t len)
+{
+	if (copy_from_user(dst, (__force void __user *)src, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* vringh_iov_xfer() callback: src -> userspace dst; 0 or -EFAULT. */
+static inline int xfer_to_user(void *dst, void *src, size_t len)
+{
+	if (copy_to_user((__force void __user *)dst, src, len))
+		return -EFAULT;
+	return 0;
+}
+
+/**
+ * vringh_init_user - initialize a vringh for a userspace vring.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
+ *
+ * Returns -EINVAL if num is not a non-zero power of 2 that fits in 16 bits,
+ * otherwise 0.  The pointers are stored unchecked: you should check them
+ * yourself!
+ */
+int vringh_init_user(struct vringh *vrh, u32 features,
+		     unsigned int num, bool weak_barriers,
+		     struct vring_desc __user *desc,
+		     struct vring_avail __user *avail,
+		     struct vring_used __user *used)
+{
+	const bool pow2 = num && !(num & (num - 1));
+
+	/* Sane power of 2 please! */
+	if (!pow2 || num > 0xffff) {
+		vringh_bad("Bad ring size %u", num);
+		return -EINVAL;
+	}
+
+	/* vring expects kernel addresses, but only used via accessors. */
+	vrh->vring.desc = (__force struct vring_desc *)desc;
+	vrh->vring.avail = (__force struct vring_avail *)avail;
+	vrh->vring.used = (__force struct vring_used *)used;
+	vrh->vring.num = num;
+
+	vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
+	vrh->weak_barriers = weak_barriers;
+
+	/* Nothing consumed or completed yet. */
+	vrh->last_avail_idx = 0;
+	vrh->last_used_idx = 0;
+	vrh->completed = 0;
+	return 0;
+}
+EXPORT_SYMBOL(vringh_init_user);
+
+/**
+ * vringh_getdesc_user - get next available descriptor from userspace ring.
+ * @vrh: the userspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @getrange: function to call to check ranges.
+ * @head: head index we received, for passing to vringh_complete_user().
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num.  You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_user(struct vringh *vrh,
+			struct vringh_iov *riov,
+			struct vringh_iov *wiov,
+			bool (*getrange)(struct vringh *vrh,
+					 u64 addr, struct vringh_range *r),
+			u16 *head)
+{
+	int ret;
+
+	/* We need the layouts to be identical for the casts below to work */
+	BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
+	BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
+		     offsetof(struct vringh_iov, iov));
+	BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
+		     offsetof(struct vringh_iov, i));
+	BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
+		     offsetof(struct vringh_iov, used));
+	BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
+		     offsetof(struct vringh_iov, max_num));
+	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
+	BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
+		     offsetof(struct kvec, iov_base));
+	BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
+		     offsetof(struct kvec, iov_len));
+	BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
+		     != sizeof(((struct kvec *)NULL)->iov_base));
+	BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
+		     != sizeof(((struct kvec *)NULL)->iov_len));
+
+	/* Until a head is found, *head reads as "invalid ring". */
+	*head = vrh->vring.num;
+
+	ret = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
+	if (ret < 0)
+		return ret;
+
+	/* Empty... */
+	if (ret == vrh->vring.num)
+		return 0;
+
+	*head = ret;
+	ret = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
+			   (struct vringh_kiov *)wiov,
+			   range_check, getrange, GFP_KERNEL, copydesc_user);
+
+	return ret ? ret : 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_user);
+
+/**
+ * vringh_iov_pull_user - copy bytes from vring_iov.
+ * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
+{
+	/* Layout equality is asserted in vringh_getdesc_user(). */
+	struct vringh_kiov *kiov = (struct vringh_kiov *)riov;
+
+	return vringh_iov_xfer(kiov, dst, len, xfer_from_user);
+}
+EXPORT_SYMBOL(vringh_iov_pull_user);
+
+/**
+ * vringh_iov_push_user - copy bytes into vring_iov.
+ * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
+ * @src: the place to copy from.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
+			     const void *src, size_t len)
+{
+	/* Layout equality is asserted in vringh_getdesc_user(). */
+	struct vringh_kiov *kiov = (struct vringh_kiov *)wiov;
+
+	/* The shared transfer helper takes a non-const pointer. */
+	return vringh_iov_xfer(kiov, (void *)src, len, xfer_to_user);
+}
+EXPORT_SYMBOL(vringh_iov_push_user);
+
+/**
+ * vringh_abandon_user - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (ie. num
+ *	 vringh_getdesc_user() to undo).
+ *
+ * The next vringh_getdesc_user() will return the old descriptor(s) again.
+ */
+void vringh_abandon_user(struct vringh *vrh, unsigned int num)
+{
+	/* We only update vring_avail_event(vr) when we want to be notified,
+	 * so we haven't changed that yet. */
+	vrh->last_avail_idx = vrh->last_avail_idx - num;
+}
+EXPORT_SYMBOL(vringh_abandon_user);
+
+/**
+ * vringh_complete_user - publish a descriptor we have finished with.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_user.
+ * @len: the length of data we have written.
+ *
+ * Builds a single used element from @head and @len and passes it to
+ * __vringh_complete() together with the userspace accessors.
+ *
+ * You should check vringh_need_notify_user() after one or more calls
+ * to this function.
+ */
+int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
+{
+	struct vring_used_elem elem = {
+		.id = head,
+		.len = len,
+	};
+
+	return __vringh_complete(vrh, &elem, 1, putu16_user, putused_user);
+}
+EXPORT_SYMBOL(vringh_complete_user);
+
+/**
+ * vringh_complete_multi_user - we've finished with many descriptors.
+ * @vrh: the vring.
+ * @used: the head, length pairs.
+ * @num_used: the number of used elements.
+ *
+ * Passes the whole @used array to __vringh_complete() in one call and
+ * returns that helper's result.
+ *
+ * You should check vringh_need_notify_user() after one or more calls
+ * to this function.
+ */
+int vringh_complete_multi_user(struct vringh *vrh,
+			       const struct vring_used_elem used[],
+			       unsigned num_used)
+{
+	return __vringh_complete(vrh, used, num_used,
+				 putu16_user, putused_user);
+}
+EXPORT_SYMBOL(vringh_complete_multi_user);
+
+/**
+ * vringh_notify_enable_user - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ *
+ * Delegates to __vringh_notify_enable() with the userspace u16 accessors.
+ */
+bool vringh_notify_enable_user(struct vringh *vrh)
+{
+	return __vringh_notify_enable(vrh, getu16_user, putu16_user);
+}
+EXPORT_SYMBOL(vringh_notify_enable_user);
+
+/**
+ * vringh_notify_disable_user - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ *
+ * Delegates to __vringh_notify_disable() with the userspace u16 writer;
+ * nothing is returned to the caller.
+ */
+void vringh_notify_disable_user(struct vringh *vrh)
+{
+	__vringh_notify_disable(vrh, putu16_user);
+}
+EXPORT_SYMBOL(vringh_notify_disable_user);
+
+/**
+ * vringh_need_notify_user - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_user() on.
+ *
+ * Delegates to __vringh_need_notify() with the userspace u16 reader.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_user(struct vringh *vrh)
+{
+	return __vringh_need_notify(vrh, getu16_user);
+}
+EXPORT_SYMBOL(vringh_need_notify_user);
+
+/* Kernelspace access helpers. */
+/* Read a u16 from kernel memory via ACCESS_ONCE(); always returns 0. */
+static inline int getu16_kern(u16 *val, const u16 *p)
+{
+	*val = ACCESS_ONCE(*p);
+	return 0;
+}
+
+/* Store a u16 to kernel memory via ACCESS_ONCE(); always returns 0. */
+static inline int putu16_kern(u16 *p, u16 val)
+{
+	ACCESS_ONCE(*p) = val;
+	return 0;
+}
+
+/* Descriptor copy routine for kernelspace rings: plain memcpy(), returns 0. */
+static inline int copydesc_kern(void *dst, const void *src, size_t len)
+{
+	memcpy(dst, src, len);
+	return 0;
+}
+
+/* Copy @num used elements from @src to @dst with memcpy(); returns 0. */
+static inline int putused_kern(struct vring_used_elem *dst,
+			       const struct vring_used_elem *src,
+			       unsigned int num)
+{
+	memcpy(dst, src, num * sizeof(*dst));
+	return 0;
+}
+
+/* Transfer routine handed to vringh_iov_xfer() for kernelspace rings.
+ * Note the parameter order is (src, dst), the reverse of memcpy();
+ * always returns 0.
+ */
+static inline int xfer_kern(void *src, void *dst, size_t len)
+{
+	memcpy(dst, src, len);
+	return 0;
+}
+
+/**
+ * vringh_init_kern - initialize a vringh for a kernelspace vring.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the kernelspace descriptor pointer.
+ * @avail: the kernelspace avail pointer.
+ * @used: the kernelspace used pointer.
+ *
+ * Resets the completed count and both ring indices to zero and records
+ * whether VIRTIO_RING_F_EVENT_IDX is set in @features.
+ *
+ * Returns 0 on success, or -EINVAL if num is zero, larger than 0xffff
+ * or not a power of two.
+ */
+int vringh_init_kern(struct vringh *vrh, u32 features,
+		     unsigned int num, bool weak_barriers,
+		     struct vring_desc *desc,
+		     struct vring_avail *avail,
+		     struct vring_used *used)
+{
+	/* Sane power of 2 please! */
+	if (!num || num > 0xffff || (num & (num - 1))) {
+		vringh_bad("Bad ring size %u", num);
+		return -EINVAL;
+	}
+
+	vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
+	vrh->weak_barriers = weak_barriers;
+	vrh->completed = 0;
+	vrh->last_avail_idx = 0;
+	vrh->last_used_idx = 0;
+	vrh->vring.num = num;
+	vrh->vring.desc = desc;
+	vrh->vring.avail = avail;
+	vrh->vring.used = used;
+	return 0;
+}
+EXPORT_SYMBOL(vringh_init_kern);
+
+/**
+ * vringh_getdesc_kern - fetch the next available descriptor from a
+ *			 kernelspace ring.
+ * @vrh: the kernelspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @head: head index we received, for passing to vringh_complete_kern().
+ * @gfp: flags for allocating larger riov/wiov.
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * On an error return the caller can distinguish an invalid ring from a
+ * single invalid descriptor: in the former case *head will be
+ * vrh->vring.num.  An invalid descriptor may be ignorable, an invalid
+ * ring is not.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_kern(struct vringh *vrh,
+			struct vringh_kiov *riov,
+			struct vringh_kiov *wiov,
+			u16 *head,
+			gfp_t gfp)
+{
+	int ret = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
+
+	if (ret < 0)
+		return ret;
+
+	/* A head equal to the ring size means nothing is available. */
+	if (ret == vrh->vring.num)
+		return 0;
+
+	*head = ret;
+	ret = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
+			   gfp, copydesc_kern);
+
+	return ret ? ret : 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_kern);
+
+/**
+ * vringh_iov_pull_kern - copy bytes from vring_iov.
+ * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Delegates to vringh_iov_xfer() with xfer_kern as the copy routine.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
+{
+	return vringh_iov_xfer(riov, dst, len, xfer_kern);
+}
+EXPORT_SYMBOL(vringh_iov_pull_kern);
+
+/**
+ * vringh_iov_push_kern - copy bytes into vring_iov.
+ * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
+ * @src: the place to copy from.
+ * @len: the maximum length to copy.
+ *
+ * Delegates to vringh_iov_xfer() with xfer_kern as the copy routine; the
+ * const qualifier on @src is cast away only to fit that helper's signature.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
+			     const void *src, size_t len)
+{
+	return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
+}
+EXPORT_SYMBOL(vringh_iov_push_kern);
+
+/**
+ * vringh_abandon_kern - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (ie. num
+ *	 vringh_getdesc_kern() to undo).
+ *
+ * The next vringh_getdesc_kern() will return the old descriptor(s) again.
+ */
+void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
+{
+	/* We only update vring_avail_event(vr) when we want to be notified,
+	 * so we haven't changed that yet. */
+	vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_kern);
+
+/**
+ * vringh_complete_kern - publish a descriptor we have finished with.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_kern.
+ * @len: the length of data we have written.
+ *
+ * Builds a single used element from @head and @len and passes it to
+ * __vringh_complete() together with the kernelspace accessors.
+ *
+ * You should check vringh_need_notify_kern() after one or more calls
+ * to this function.
+ */
+int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
+{
+	struct vring_used_elem elem = {
+		.id = head,
+		.len = len,
+	};
+
+	return __vringh_complete(vrh, &elem, 1, putu16_kern, putused_kern);
+}
+EXPORT_SYMBOL(vringh_complete_kern);
+
+/**
+ * vringh_notify_enable_kern - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ *
+ * Delegates to __vringh_notify_enable() with the kernelspace u16 accessors.
+ */
+bool vringh_notify_enable_kern(struct vringh *vrh)
+{
+	return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
+}
+EXPORT_SYMBOL(vringh_notify_enable_kern);
+
+/**
+ * vringh_notify_disable_kern - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ *
+ * Delegates to __vringh_notify_disable() with the kernelspace u16 writer;
+ * nothing is returned to the caller.
+ */
+void vringh_notify_disable_kern(struct vringh *vrh)
+{
+	__vringh_notify_disable(vrh, putu16_kern);
+}
+EXPORT_SYMBOL(vringh_notify_disable_kern);
+
+/**
+ * vringh_need_notify_kern - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_kern() on.
+ *
+ * Delegates to __vringh_need_notify() with the kernelspace u16 reader.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_kern(struct vringh *vrh)
+{
+	return __vringh_need_notify(vrh, getu16_kern);
+}
+EXPORT_SYMBOL(vringh_need_notify_kern);
-- 
cgit v1.2.3


From 0d2e1a2926b1839a4b74519e660739b2566c9386 Mon Sep 17 00:00:00 2001
From: Erwan Yvin <erwan.yvin@stericsson.com>
Date: Wed, 20 Mar 2013 13:52:24 +1030
Subject: caif_virtio: Introduce caif over virtio

Add the CAIF Virtio shared memory driver for talking
to a modem.

This CAIF Link layer communicates to the modem over
shared memory. It is implemented as a virtio_driver.
The underlying virtio device is managed by the remoteproc
framework. The Virtio queue is used for transmitting data
to the modem, and the new vringh is used for receiving data.

Genalloc is used for managing the shared memory used for TX
data. The default dma-alloc-coherent allocator can only
allocate whole pages, and this wastes too much shared memory.

Flow control is implemented by stopping the TX-queues if the
virtio queues go full or we run out of memory. Queues are
reopened when queues are below the watermark.

NAPI is used in RX path, and a dedicated tasklet is used
for releasing TX buffers.

Signed-off-by: Erwan Yvin <erwan.yvin@stericsson.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (minor fixes)
---
 drivers/net/caif/Kconfig       |  14 +
 drivers/net/caif/Makefile      |   3 +
 drivers/net/caif/caif_virtio.c | 785 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 802 insertions(+)
 create mode 100644 drivers/net/caif/caif_virtio.c

(limited to 'drivers')

diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index 60c2142373c9..893f9154011e 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -47,3 +47,17 @@ config CAIF_HSI
        The caif low level driver for CAIF over HSI.
        Be aware that if you enable this then you also need to
        enable a low-level HSI driver.
+
+config CAIF_VIRTIO
+	tristate "CAIF virtio transport driver"
+	depends on CAIF
+	select VHOST_RING
+	select VIRTIO
+	select GENERIC_ALLOCATOR
+	default n
+	---help---
+	The caif driver for CAIF over Virtio.
+
+if CAIF_VIRTIO
+source "drivers/vhost/Kconfig"
+endif
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
index 91dff861560f..d9ee26a96c6e 100644
--- a/drivers/net/caif/Makefile
+++ b/drivers/net/caif/Makefile
@@ -13,3 +13,6 @@ obj-$(CONFIG_CAIF_SHM) += caif_shm.o
 
 # HSI interface
 obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
+
+# Virtio interface
+obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
new file mode 100644
index 000000000000..b1e1205e4e28
--- /dev/null
+++ b/drivers/net/caif/caif_virtio.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2013
+ * Authors: Vicram Arv / vikram.arv@stericsson.com,
+ *	    Dmitry Tarnyagin / dmitry.tarnyagin@stericsson.com
+ *	    Sjur Brendeland / sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/virtio.h>
+#include <linux/vringh.h>
+#include <linux/debugfs.h>
+#include <linux/spinlock.h>
+#include <linux/genalloc.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_caif.h>
+#include <linux/virtio_ring.h>
+#include <linux/dma-mapping.h>
+#include <net/caif/caif_dev.h>
+#include <linux/virtio_config.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vicram Arv <vikram.arv@stericsson.com>");
+MODULE_AUTHOR("Sjur Brendeland <sjur.brandeland@stericsson.com>");
+MODULE_DESCRIPTION("Virtio CAIF Driver");
+
+/* NAPI schedule quota */
+#define CFV_DEFAULT_QUOTA 32
+
+/* Defaults used if virtio config space is unavailable */
+#define CFV_DEF_MTU_SIZE 4096
+#define CFV_DEF_HEADROOM 32
+#define CFV_DEF_TAILROOM 32
+
+/* Required IP header alignment */
+#define IP_HDR_ALIGN 4
+
+/* struct cfv_napi_context - NAPI context info
+ * @riov: IOV holding data read from the ring. Note that riov may
+ *	  still hold data when cfv_rx_poll() returns.
+ * @head: Last descriptor ID we received from vringh_getdesc_kern.
+ *	  We use this to put descriptor back on the used ring. USHRT_MAX is
+ *	  used to indicate invalid head-id.
+ */
+struct cfv_napi_context {
+	struct vringh_kiov riov;
+	unsigned short head;
+};
+
+/* struct cfv_stats - statistics for debugfs
+ * @rx_napi_complete:	Number of NAPI completions (RX)
+ * @rx_napi_resched:	Number of calls where the full quota was used (RX)
+ * @rx_nomem:		Number of SKB alloc failures (RX)
+ * @rx_kicks:		Number of RX kicks
+ * @tx_full_ring:	Number of times TX ring was full
+ * @tx_no_mem:		Number of times TX went out of memory
+ * @tx_flow_on:		Number of flow on (TX)
+ * @tx_kicks:		Number of TX kicks
+ */
+struct cfv_stats {
+	u32 rx_napi_complete;
+	u32 rx_napi_resched;
+	u32 rx_nomem;
+	u32 rx_kicks;
+	u32 tx_full_ring;
+	u32 tx_no_mem;
+	u32 tx_flow_on;
+	u32 tx_kicks;
+};
+
+/* struct cfv_info - Caif Virtio control structure
+ * @cfdev:	caif common header
+ * @vdev:	Associated virtio device
+ * @vr_rx:	rx/downlink host vring
+ * @vq_tx:	tx/uplink virtqueue
+ * @ndev:	CAIF link layer device
+ * @watermark_tx: indicates number of free descriptors we need
+ *		to reopen the tx-queues after overload.
+ * @tx_lock:	protects vq_tx from concurrent use
+ * @tx_release_tasklet: Tasklet for freeing consumed TX buffers
+ * @napi:       Napi context used in cfv_rx_poll()
+ * @ctx:        Context data used in cfv_rx_poll()
+ * @tx_hr:	transmit headroom
+ * @rx_hr:	receive headroom
+ * @tx_tr:	transmit tail room
+ * @rx_tr:	receive tail room
+ * @mtu:	transmit max size
+ * @mru:	receive max size
+ * @allocsz:    size of dma memory reserved for TX buffers
+ * @alloc_addr: virtual address to dma memory for TX buffers
+ * @alloc_dma:  dma address to dma memory for TX buffers
+ * @genpool:    Gen Pool used for allocating TX buffers
+ * @reserved_mem: Address of the memory reserve allocated from genpool,
+ *		or 0 while the reserve has been released
+ * @reserved_size: Size of memory reserve allocated from genpool
+ * @stats:       Statistics exposed in debugfs
+ * @debugfs:    Debugfs dentry for statistic counters
+ */
+struct cfv_info {
+	struct caif_dev_common cfdev;
+	struct virtio_device *vdev;
+	struct vringh *vr_rx;
+	struct virtqueue *vq_tx;
+	struct net_device *ndev;
+	unsigned int watermark_tx;
+	/* Protect access to vq_tx */
+	spinlock_t tx_lock;
+	struct tasklet_struct tx_release_tasklet;
+	struct napi_struct napi;
+	struct cfv_napi_context ctx;
+	u16 tx_hr;
+	u16 rx_hr;
+	u16 tx_tr;
+	u16 rx_tr;
+	u32 mtu;
+	u32 mru;
+	size_t allocsz;
+	void *alloc_addr;
+	dma_addr_t alloc_dma;
+	struct gen_pool *genpool;
+	unsigned long reserved_mem;
+	size_t reserved_size;
+	struct cfv_stats stats;
+	struct dentry *debugfs;
+};
+
+/* struct buf_info - maintains transmit buffer data handle
+ * @size:	size of transmit buffer
+ * @vaddr:	virtual address of the buffer, as returned by the genpool
+ */
+struct buf_info {
+	size_t size;
+	u8 *vaddr;
+};
+
+/* Called from virtio device, in IRQ context.
+ * Counts the kick and defers the actual buffer release to
+ * tx_release_tasklet.
+ */
+static void cfv_release_cb(struct virtqueue *vq_tx)
+{
+	struct cfv_info *cfv = vq_tx->vdev->priv;
+
+	++cfv->stats.tx_kicks;
+	tasklet_schedule(&cfv->tx_release_tasklet);
+}
+
+/* Return a TX buffer to the genpool and free its bookkeeping struct.
+ * A NULL buf_info is silently ignored.
+ */
+static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info)
+{
+	if (buf_info) {
+		unsigned long addr = (unsigned long) buf_info->vaddr;
+
+		gen_pool_free(cfv->genpool, addr, buf_info->size);
+		kfree(buf_info);
+	}
+}
+
+/* This is invoked whenever the remote processor completed processing
+ * a TX msg we just sent, and the buffer is put back to the used ring.
+ *
+ * Drains the used ring one buffer at a time. tx_lock is held only around
+ * virtqueue_get_buf(); the watermark and reserve handling below runs
+ * without it.
+ */
+static void cfv_release_used_buf(struct virtqueue *vq_tx)
+{
+	struct cfv_info *cfv = vq_tx->vdev->priv;
+	unsigned long flags;
+
+	BUG_ON(vq_tx != cfv->vq_tx);
+
+	for (;;) {
+		unsigned int len;
+		struct buf_info *buf_info;
+
+		/* Get used buffer from used ring to recycle used descriptors */
+		spin_lock_irqsave(&cfv->tx_lock, flags);
+		buf_info = virtqueue_get_buf(vq_tx, &len);
+		spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+		/* Stop looping if there are no more buffers to free */
+		if (!buf_info)
+			break;
+
+		free_buf_info(cfv, buf_info);
+
+		/* watermark_tx indicates if we previously stopped the tx
+		 * queues. If we have enough free slots in the virtio ring,
+		 * re-establish memory reserved and open up tx queues.
+		 */
+		if (cfv->vq_tx->num_free <= cfv->watermark_tx)
+			continue;
+
+		/* Re-establish memory reserve */
+		if (cfv->reserved_mem == 0 && cfv->genpool)
+			cfv->reserved_mem =
+				gen_pool_alloc(cfv->genpool,
+					       cfv->reserved_size);
+
+		/* Open up the tx queues */
+		if (cfv->reserved_mem) {
+			/* Setting the watermark to the ring size makes the
+			 * num_free check above skip this path from now on.
+			 */
+			cfv->watermark_tx =
+				virtqueue_get_vring_size(cfv->vq_tx);
+			netif_tx_wake_all_queues(cfv->ndev);
+			/* Buffers are recycled in cfv_netdev_tx, so
+			 * disable notifications when queues are opened.
+			 */
+			virtqueue_disable_cb(cfv->vq_tx);
+			++cfv->stats.tx_flow_on;
+		} else {
+			/* if no memory reserve, wait for more free slots */
+			WARN_ON(cfv->watermark_tx >
+			       virtqueue_get_vring_size(cfv->vq_tx));
+			cfv->watermark_tx +=
+				virtqueue_get_vring_size(cfv->vq_tx) / 4;
+		}
+	}
+}
+
+/* Build an SKB holding the CAIF payload of one received frame.
+ *
+ * The frame @frm of @frm_len bytes carries rx_hr bytes of headroom and
+ * rx_tr bytes of tailroom around the payload. On success the new SKB is
+ * returned and *err is 0. On failure NULL is returned and *err is
+ * -EPROTO for a frame of unacceptable length or -ENOMEM if the SKB
+ * could not be allocated.
+ */
+static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
+					      struct cfv_info *cfv,
+					      u8 *frm, u32 frm_len)
+{
+	const u32 overhead = cfv->rx_hr + cfv->rx_tr;
+	u8 *payload = frm + cfv->rx_hr;
+	struct sk_buff *new_skb;
+	u32 payload_len, align_pad;
+
+	/* Frame must fit in the MRU and contain at least one payload byte */
+	if (frm_len > cfv->mru || frm_len <= overhead) {
+		netdev_err(cfv->ndev,
+			   "Invalid frmlen:%u  mtu:%u hr:%d tr:%d\n",
+			   frm_len, cfv->mru,  cfv->rx_hr,
+			   cfv->rx_tr);
+		*err = -EPROTO;
+		return NULL;
+	}
+
+	payload_len = frm_len - overhead;
+	/* Low address bits of the payload within the source frame */
+	align_pad = (unsigned long)payload & (IP_HDR_ALIGN - 1);
+
+	new_skb = netdev_alloc_skb(cfv->ndev, frm_len + align_pad);
+	if (!new_skb) {
+		*err = -ENOMEM;
+		return NULL;
+	}
+
+	*err = 0;
+	skb_reserve(new_skb, cfv->rx_hr + align_pad);
+	memcpy(skb_put(new_skb, payload_len), payload, payload_len);
+
+	return new_skb;
+}
+
+/* Get packets from the host vring.
+ *
+ * NAPI poll callback. Each element of the current riov is copied into a
+ * fresh SKB and passed up the stack, until @quota packets were handled,
+ * the ring is empty, or an error occurs. Returns the number of packets
+ * processed.
+ */
+static int cfv_rx_poll(struct napi_struct *napi, int quota)
+{
+	struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
+	int rxcnt = 0;
+	int err = 0;
+	void *buf;
+	struct sk_buff *skb;
+	struct vringh_kiov *riov = &cfv->ctx.riov;
+	unsigned int skb_len;
+
+again:
+	do {
+		skb = NULL;
+
+		/* Put the previous iovec back on the used ring and
+		 * fetch a new iovec if we have processed all elements.
+		 */
+		if (riov->i == riov->used) {
+			if (cfv->ctx.head != USHRT_MAX) {
+				vringh_complete_kern(cfv->vr_rx,
+						     cfv->ctx.head,
+						     0);
+				cfv->ctx.head = USHRT_MAX;
+			}
+
+			err = vringh_getdesc_kern(
+				cfv->vr_rx,
+				riov,
+				NULL,
+				&cfv->ctx.head,
+				GFP_ATOMIC);
+
+			/* 0 means the ring is empty, negative is an error */
+			if (err <= 0)
+				goto exit;
+		}
+
+		buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
+		/* TODO: Add check on valid buffer address */
+
+		skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
+					     riov->iov[riov->i].iov_len);
+		if (unlikely(err))
+			goto exit;
+
+		/* Push received packet up the stack. */
+		/* skb->len is saved first; the skb is not touched again
+		 * after netif_receive_skb().
+		 */
+		skb_len = skb->len;
+		skb->protocol = htons(ETH_P_CAIF);
+		skb_reset_mac_header(skb);
+		skb->dev = cfv->ndev;
+		err = netif_receive_skb(skb);
+		if (unlikely(err)) {
+			++cfv->ndev->stats.rx_dropped;
+		} else {
+			++cfv->ndev->stats.rx_packets;
+			cfv->ndev->stats.rx_bytes += skb_len;
+		}
+
+		++riov->i;
+		++rxcnt;
+	} while (rxcnt < quota);
+
+	/* Full quota used: NAPI is left scheduled */
+	++cfv->stats.rx_napi_resched;
+	goto out;
+
+exit:
+	switch (err) {
+	case 0:
+		++cfv->stats.rx_napi_complete;
+
+		/* Really out of packets? (stolen from virtio_net)*/
+		napi_complete(napi);
+		if (unlikely(vringh_notify_enable_kern(cfv->vr_rx)) &&
+		    napi_schedule_prep(napi)) {
+			vringh_notify_disable_kern(cfv->vr_rx);
+			__napi_schedule(napi);
+			goto again;
+		}
+		break;
+
+	case -ENOMEM:
+		++cfv->stats.rx_nomem;
+		dev_kfree_skb(skb);
+		/* Stop NAPI poll on OOM, we hope to be polled later */
+		napi_complete(napi);
+		vringh_notify_enable_kern(cfv->vr_rx);
+		break;
+
+	default:
+		/* We're doomed, any modem fault is fatal */
+		netdev_warn(cfv->ndev, "Bad ring, disable device\n");
+		/* NOTE(review): this assignment overwrites the drops counted
+		 * in the loop above; presumably "+=" was intended - confirm.
+		 */
+		cfv->ndev->stats.rx_dropped = riov->used - riov->i;
+		napi_complete(napi);
+		vringh_notify_disable_kern(cfv->vr_rx);
+		netif_carrier_off(cfv->ndev);
+		break;
+	}
+out:
+	/* Only kick the other side if we handled at least one packet */
+	if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0)
+		vringh_notify(cfv->vr_rx);
+	return rxcnt;
+}
+
+/* RX vringh callback, registered through find_vrhs() in cfv_probe().
+ * Counts the kick, disables further ring notifications and schedules
+ * NAPI so that cfv_rx_poll() does the actual receive work.
+ */
+static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx)
+{
+	struct cfv_info *cfv = vdev->priv;
+
+	++cfv->stats.rx_kicks;
+	vringh_notify_disable_kern(cfv->vr_rx);
+	napi_schedule(&cfv->napi);
+}
+
+/* Release the DMA memory backing the TX buffers and destroy the genpool.
+ *
+ * Safe to call on a partially set up pool, as done from the error path
+ * of cfv_create_genpool(). Pointers are cleared once released so that
+ * no stale address survives until the next open.
+ */
+static void cfv_destroy_genpool(struct cfv_info *cfv)
+{
+	if (cfv->alloc_addr) {
+		dma_free_coherent(cfv->vdev->dev.parent->parent,
+				  cfv->allocsz, cfv->alloc_addr,
+				  cfv->alloc_dma);
+		cfv->alloc_addr = NULL;
+	}
+
+	if (!cfv->genpool)
+		return;
+
+	/* reserved_mem is 0 when cfv_netdev_tx() has released the reserve
+	 * or when setting up the reserve failed. Address 0 does not belong
+	 * to the pool, so it must not be handed to gen_pool_free().
+	 */
+	if (cfv->reserved_mem) {
+		gen_pool_free(cfv->genpool, cfv->reserved_mem,
+			      cfv->reserved_size);
+		cfv->reserved_mem = 0;
+	}
+	gen_pool_destroy(cfv->genpool);
+	cfv->genpool = NULL;
+}
+
+/* Allocate DMA memory for TX buffers and wrap it in a genpool.
+ *
+ * Also sets aside a memory reserve of num_possible_cpus() * mtu bytes
+ * and initialises watermark_tx to the TX ring size.
+ *
+ * Returns 0 on success, -EINVAL if the computed pool size is too small
+ * for the reserve, or -ENOMEM (or the gen_pool_add_virt() error) if an
+ * allocation fails. On failure everything allocated here is released.
+ */
+static int cfv_create_genpool(struct cfv_info *cfv)
+{
+	int err;
+
+	/* dma_alloc can only allocate whole pages, and we need a more
+	 * fine grained allocation so we use genpool. We ask for space needed
+	 * by IP and a full ring. If the dma allocation fails we retry with a
+	 * smaller allocation size.
+	 */
+	cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) *
+			(ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10;
+	if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu)
+		return -EINVAL;
+
+	for (;;) {
+		if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) {
+			netdev_info(cfv->ndev, "Not enough device memory\n");
+			return -ENOMEM;
+		}
+
+		cfv->alloc_addr = dma_alloc_coherent(
+						cfv->vdev->dev.parent->parent,
+						cfv->allocsz, &cfv->alloc_dma,
+						GFP_ATOMIC);
+		if (cfv->alloc_addr)
+			break;
+
+		/* Retry with 3/4 of the previous size */
+		cfv->allocsz = (cfv->allocsz * 3) >> 2;
+	}
+
+	netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n",
+		   cfv->allocsz);
+
+	/* Allocate on 128 bytes boundaries (1 << 7)*/
+	cfv->genpool = gen_pool_create(7, -1);
+	if (!cfv->genpool) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr,
+				(phys_addr_t)virt_to_phys(cfv->alloc_addr),
+				cfv->allocsz, -1);
+	if (err)
+		goto err;
+
+	/* Reserve some memory for low memory situations. If we hit the roof
+	 * in the memory pool, we stop TX flow and release the reserve.
+	 */
+	cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
+	cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
+					   cfv->reserved_size);
+	if (!cfv->reserved_mem) {
+		/* err is 0 here after the successful gen_pool_add_virt(),
+		 * so it has to be set or the failure is reported as success.
+		 */
+		err = -ENOMEM;
+		goto err;
+	}
+
+	cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
+	return 0;
+err:
+	cfv_destroy_genpool(cfv);
+	return err;
+}
+
+/* ndo_open: set up the TX memory pool, then bring the link up.
+ *
+ * Any failure to create the pool is reported as -ENOMEM. NAPI is
+ * scheduled once right away so packets already waiting in the ring
+ * get picked up.
+ */
+static int cfv_netdev_open(struct net_device *netdev)
+{
+	struct cfv_info *cfv = netdev_priv(netdev);
+	int rc = cfv_create_genpool(cfv);
+
+	if (rc != 0)
+		return -ENOMEM;
+
+	netif_carrier_on(netdev);
+
+	/* Schedule NAPI to read any pending packets */
+	napi_enable(&cfv->napi);
+	napi_schedule(&cfv->napi);
+
+	return 0;
+}
+
+/* Disable the CAIF interface and free the memory-pool.
+ *
+ * ndo_stop handler; always returns 0. All TX buffers are returned to
+ * the genpool before cfv_destroy_genpool() is called.
+ */
+static int cfv_netdev_close(struct net_device *netdev)
+{
+	struct cfv_info *cfv = netdev_priv(netdev);
+	unsigned long flags;
+	struct buf_info *buf_info;
+
+	/* Disable interrupts, queues and NAPI polling */
+	netif_carrier_off(netdev);
+	virtqueue_disable_cb(cfv->vq_tx);
+	vringh_notify_disable_kern(cfv->vr_rx);
+	napi_disable(&cfv->napi);
+
+	/* Release any TX buffers on both used and available rings */
+	cfv_release_used_buf(cfv->vq_tx);
+	spin_lock_irqsave(&cfv->tx_lock, flags);
+	while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx)))
+		free_buf_info(cfv, buf_info);
+	spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+	/* Release all dma allocated memory and destroy the pool */
+	cfv_destroy_genpool(cfv);
+	return 0;
+}
+
+/* Allocate a buffer in dma-memory and copy skb to it.
+ *
+ * Buffer layout: [pad_len][tx_hr headroom][skb data][tx_tr tailroom].
+ * @sg is initialised to cover everything after the alignment padding.
+ *
+ * Returns the new buf_info, or NULL if there is no genpool, the packet
+ * is too long for the MTU, or an allocation failed.
+ */
+static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv,
+						       struct sk_buff *skb,
+						       struct scatterlist *sg)
+{
+	struct caif_payload_info *info = (void *)&skb->cb;
+	struct buf_info *buf_info = NULL;
+	u8 pad_len, hdr_ofs;
+
+	if (!cfv->genpool)
+		goto err;
+
+	if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) {
+		netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n",
+			    cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu);
+		goto err;
+	}
+
+	buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC);
+	if (unlikely(!buf_info))
+		goto err;
+
+	/* Make the IP header aligned in the buffer */
+	hdr_ofs = cfv->tx_hr + info->hdr_len;
+	pad_len = hdr_ofs & (IP_HDR_ALIGN - 1);
+	buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len;
+
+	/* allocate dma memory buffer */
+	buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size);
+	if (unlikely(!buf_info->vaddr))
+		goto err;
+
+	/* copy skbuf contents to send buffer */
+	skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len);
+
+	/* The sg covers headroom, payload and TX tailroom. Using tx_tr
+	 * keeps it inside the buf_info->size bytes allocated above.
+	 */
+	sg_init_one(sg, buf_info->vaddr + pad_len,
+		    skb->len + cfv->tx_hr + cfv->tx_tr);
+
+	return buf_info;
+err:
+	kfree(buf_info);
+	return NULL;
+}
+
+/* Put the CAIF packet on the virtio ring and kick the receiver.
+ *
+ * ndo_start_xmit handler. The skb is always consumed and NETDEV_TX_OK
+ * is returned on every path; failures are counted in tx_dropped.
+ */
+static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct cfv_info *cfv = netdev_priv(netdev);
+	struct buf_info *buf_info;
+	struct scatterlist sg;
+	unsigned long flags;
+	bool flow_off = false;
+	int ret;
+
+	/* garbage collect released buffers */
+	cfv_release_used_buf(cfv->vq_tx);
+	spin_lock_irqsave(&cfv->tx_lock, flags);
+
+	/* Flow-off check takes into account number of cpus to make sure
+	 * virtqueue will not be overfilled in any possible smp conditions.
+	 *
+	 * Flow-on is triggered when sufficient buffers are freed
+	 */
+	if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) {
+		flow_off = true;
+		cfv->stats.tx_full_ring++;
+	}
+
+	/* If we run out of memory, we release the memory reserve and retry
+	 * allocation.
+	 */
+	buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
+	if (unlikely(!buf_info)) {
+		cfv->stats.tx_no_mem++;
+		flow_off = true;
+
+		if (cfv->reserved_mem && cfv->genpool) {
+			gen_pool_free(cfv->genpool,  cfv->reserved_mem,
+				      cfv->reserved_size);
+			cfv->reserved_mem = 0;
+			buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
+		}
+	}
+
+	if (unlikely(flow_off)) {
+		/* Turn flow on when a 1/4 of the descriptors are released */
+		cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
+		/* Enable notifications of recycled TX buffers */
+		virtqueue_enable_cb(cfv->vq_tx);
+		netif_tx_stop_all_queues(netdev);
+	}
+
+	if (unlikely(!buf_info)) {
+		/* If the memory reserve does its job, this shouldn't happen */
+		netdev_warn(cfv->ndev, "Out of gen_pool memory\n");
+		goto err;
+	}
+
+	ret = virtqueue_add_buf(cfv->vq_tx, &sg, 1, 0,
+				buf_info, GFP_ATOMIC);
+	if (unlikely((ret < 0))) {
+		/* If flow control works, this shouldn't happen */
+		netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
+			    ret);
+		goto err;
+	}
+
+	/* update netdev statistics */
+	cfv->ndev->stats.tx_packets++;
+	cfv->ndev->stats.tx_bytes += skb->len;
+	spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+	/* tell the remote processor it has a pending message to read */
+	virtqueue_kick(cfv->vq_tx);
+
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+err:
+	spin_unlock_irqrestore(&cfv->tx_lock, flags);
+	cfv->ndev->stats.tx_dropped++;
+	/* buf_info may be NULL here; free_buf_info() ignores NULL */
+	free_buf_info(cfv, buf_info);
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Tasklet body scheduled from cfv_release_cb(): recycles consumed TX
+ * buffers. @drv is the struct cfv_info pointer given to tasklet_init().
+ */
+static void cfv_tx_release_tasklet(unsigned long drv)
+{
+	struct cfv_info *cfv = (struct cfv_info *)drv;
+	cfv_release_used_buf(cfv->vq_tx);
+}
+
+/* Net device operations: open, stop and transmit handlers */
+static const struct net_device_ops cfv_netdev_ops = {
+	.ndo_open = cfv_netdev_open,
+	.ndo_stop = cfv_netdev_close,
+	.ndo_start_xmit = cfv_netdev_tx,
+};
+
+/* Setup callback passed to alloc_netdev(): fills in the CAIF link
+ * defaults. The MTU set here is overwritten later in cfv_probe().
+ */
+static void cfv_netdev_setup(struct net_device *netdev)
+{
+	netdev->netdev_ops = &cfv_netdev_ops;
+	netdev->type = ARPHRD_CAIF;
+	netdev->tx_queue_len = 100;
+	netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
+	netdev->mtu = CFV_DEF_MTU_SIZE;
+	netdev->destructor = free_netdev;
+}
+
+/* Create debugfs counters for the device.
+ *
+ * A directory named after the net device is created in the debugfs
+ * root, with one read-only u32 file per counter in struct cfv_stats.
+ * Failure to create the directory is not fatal, the counters are then
+ * simply not exported.
+ */
+static inline void debugfs_init(struct cfv_info *cfv)
+{
+	cfv->debugfs =
+		debugfs_create_dir(netdev_name(cfv->ndev), NULL);
+
+	/* Bail out on NULL as well as on an error pointer: with a NULL
+	 * parent the files below would be created in the debugfs root.
+	 */
+	if (IS_ERR_OR_NULL(cfv->debugfs))
+		return;
+
+	debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.rx_napi_complete);
+	debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.rx_napi_resched);
+	debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.rx_nomem);
+	debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.rx_kicks);
+	debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.tx_full_ring);
+	debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.tx_no_mem);
+	debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.tx_kicks);
+	debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs,
+			   &cfv->stats.tx_flow_on);
+}
+
+/* Set up CAIF for a virtio device.
+ *
+ * Allocates the net device, looks up the RX vringh and the TX virtqueue,
+ * reads headroom/tailroom/MTU from virtio config space when a get()
+ * operation exists (compile-time defaults otherwise), initialises NAPI
+ * and the TX release tasklet, and registers the net device.
+ *
+ * Returns 0 on success or a negative errno. On failure the rings found
+ * so far are deleted and the net device is freed.
+ */
+static int cfv_probe(struct virtio_device *vdev)
+{
+	vq_callback_t *vq_cbs = cfv_release_cb;
+	vrh_callback_t *vrh_cbs = cfv_recv;
+	const char *names =  "output";
+	const char *cfv_netdev_name = "cfvrt";
+	struct net_device *netdev;
+	struct cfv_info *cfv;
+	int err = -EINVAL;
+
+	netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
+			      cfv_netdev_setup);
+	if (!netdev)
+		return -ENOMEM;
+
+	cfv = netdev_priv(netdev);
+	cfv->vdev = vdev;
+	cfv->ndev = netdev;
+
+	spin_lock_init(&cfv->tx_lock);
+
+	/* The device must provide host side vring operations */
+	err = -ENODEV;
+	if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
+		goto err;
+
+	/* Get the RX virtio ring. This is a "host side vring". */
+	err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
+	if (err)
+		goto err;
+
+	/* Get the TX virtio ring. This is a "guest side vring". */
+	err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
+	if (err)
+		goto err;
+
+	/* Get the CAIF configuration from virtio config space, if available */
+#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
+	((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
+			   &_var, \
+			   FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
+
+	if (vdev->config->get) {
+		GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
+		GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
+		GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
+		GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
+		GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
+		GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
+	} else {
+		cfv->tx_hr = CFV_DEF_HEADROOM;
+		cfv->rx_hr = CFV_DEF_HEADROOM;
+		cfv->tx_tr = CFV_DEF_TAILROOM;
+		cfv->rx_tr = CFV_DEF_TAILROOM;
+		cfv->mtu = CFV_DEF_MTU_SIZE;
+		cfv->mru = CFV_DEF_MTU_SIZE;
+	}
+
+	netdev->needed_headroom = cfv->tx_hr;
+	netdev->needed_tailroom = cfv->tx_tr;
+
+	/* Disable buffer release interrupts unless we have stopped TX queues */
+	virtqueue_disable_cb(cfv->vq_tx);
+
+	netdev->mtu = cfv->mtu - cfv->tx_tr;
+	vdev->priv = cfv;
+
+	/* Initialize NAPI poll context data */
+	vringh_kiov_init(&cfv->ctx.riov, NULL, 0);
+	cfv->ctx.head = USHRT_MAX;
+	netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA);
+
+	tasklet_init(&cfv->tx_release_tasklet,
+		     cfv_tx_release_tasklet,
+		     (unsigned long)cfv);
+
+	/* Carrier is off until netdevice is opened */
+	netif_carrier_off(netdev);
+
+	/* register Netdev */
+	err = register_netdev(netdev);
+	if (err) {
+		dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err);
+		goto err;
+	}
+
+	debugfs_init(cfv);
+
+	return 0;
+err:
+	netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err);
+
+	/* Only delete what was actually found. cfv->vdev is always set at
+	 * this point, so vq_tx is the pointer that tells whether find_vqs()
+	 * succeeded.
+	 */
+	if (cfv->vr_rx)
+		vdev->vringh_config->del_vrhs(cfv->vdev);
+	if (cfv->vq_tx)
+		vdev->config->del_vqs(cfv->vdev);
+	free_netdev(netdev);
+	return err;
+}
+
+/* Tear down the CAIF virtio device: close the interface, stop the TX
+ * release tasklet, remove the debugfs entries, reset the device, delete
+ * both rings and finally unregister the net device.
+ */
+static void cfv_remove(struct virtio_device *vdev)
+{
+	struct cfv_info *cfv = vdev->priv;
+
+	/* dev_close() is called with the RTNL lock held */
+	rtnl_lock();
+	dev_close(cfv->ndev);
+	rtnl_unlock();
+
+	tasklet_kill(&cfv->tx_release_tasklet);
+	debugfs_remove_recursive(cfv->debugfs);
+
+	vringh_kiov_cleanup(&cfv->ctx.riov);
+	vdev->config->reset(vdev);
+	vdev->vringh_config->del_vrhs(cfv->vdev);
+	cfv->vr_rx = NULL;
+	vdev->config->del_vqs(cfv->vdev);
+	/* cfv lives in the netdev private area and the netdev destructor
+	 * is free_netdev, so cfv must not be used after this call.
+	 */
+	unregister_netdev(cfv->ndev);
+}
+
+/* Match any virtio device with the CAIF device ID; zero entry terminates */
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+/* Feature table for the virtio driver; intentionally empty */
+static unsigned int features[] = {
+};
+
+/* Virtio driver description, registered by module_virtio_driver() below */
+static struct virtio_driver caif_virtio_driver = {
+	.feature_table		= features,
+	.feature_table_size	= ARRAY_SIZE(features),
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.probe			= cfv_probe,
+	.remove			= cfv_remove,
+};
+
+module_virtio_driver(caif_virtio_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
-- 
cgit v1.2.3


From 13816c768d46586e925b22736992258d6105ad2c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:37:09 +1030
Subject: virtio_ring: virtqueue_add_sgs, to add multiple sgs.

virtio_scsi can really use this, to avoid the current hack of copying
the whole sg array.  Some other things get slightly neater, too.

This causes a slowdown in virtqueue_add_buf(), which is implemented as
a wrapper.  This is addressed in the next patches.

for i in `seq 50`; do /usr/bin/time -f 'Wall time:%e' ./vringh_test --indirect --eventidx --parallel --fast-vringh; done 2>&1 | stats --trim-outliers:

Before:
	Using CPUS 0 and 3
	Guest: notified 0, pinged 39009-39063(39062)
	Host: notified 39009-39063(39062), pinged 0
	Wall time:1.700000-1.950000(1.723542)

After:
	Using CPUS 0 and 3
	Guest: notified 0, pinged 39062-39063(39063)
	Host: notified 39062-39063(39063), pinged 0
	Wall time:1.760000-2.220000(1.789167)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Reviewed-by: Asias He <asias@redhat.com>
---
 drivers/virtio/virtio_ring.c | 220 ++++++++++++++++++++++++++++++-------------
 1 file changed, 157 insertions(+), 63 deletions(-)

(limited to 'drivers')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 245177c286ae..a78ad459cc85 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -98,16 +98,36 @@ struct vring_virtqueue
 
 #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
 
+static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
+						  unsigned int *count)
+{
+	return sg_next(sg);
+}
+
+static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
+					      unsigned int *count)
+{
+	if (--(*count) == 0)
+		return NULL;
+	return sg + 1;
+}
+
 /* Set up an indirect table of descriptors and add it to the queue. */
-static int vring_add_indirect(struct vring_virtqueue *vq,
-			      struct scatterlist sg[],
-			      unsigned int out,
-			      unsigned int in,
-			      gfp_t gfp)
+static inline int vring_add_indirect(struct vring_virtqueue *vq,
+				     struct scatterlist *sgs[],
+				     struct scatterlist *(*next)
+				       (struct scatterlist *, unsigned int *),
+				     unsigned int total_sg,
+				     unsigned int total_out,
+				     unsigned int total_in,
+				     unsigned int out_sgs,
+				     unsigned int in_sgs,
+				     gfp_t gfp)
 {
 	struct vring_desc *desc;
 	unsigned head;
-	int i;
+	struct scatterlist *sg;
+	int i, n;
 
 	/*
 	 * We require lowmem mappings for the descriptors because
@@ -116,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
 	 */
 	gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
 
-	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
+	desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
 	if (!desc)
 		return -ENOMEM;
 
-	/* Transfer entries from the sg list into the indirect page */
-	for (i = 0; i < out; i++) {
-		desc[i].flags = VRING_DESC_F_NEXT;
-		desc[i].addr = sg_phys(sg);
-		desc[i].len = sg->length;
-		desc[i].next = i+1;
-		sg++;
+	/* Transfer entries from the sg lists into the indirect page */
+	i = 0;
+	for (n = 0; n < out_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+			desc[i].flags = VRING_DESC_F_NEXT;
+			desc[i].addr = sg_phys(sg);
+			desc[i].len = sg->length;
+			desc[i].next = i+1;
+			i++;
+		}
 	}
-	for (; i < (out + in); i++) {
-		desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
-		desc[i].addr = sg_phys(sg);
-		desc[i].len = sg->length;
-		desc[i].next = i+1;
-		sg++;
+	for (; n < (out_sgs + in_sgs); n++) {
+		for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+			desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+			desc[i].addr = sg_phys(sg);
+			desc[i].len = sg->length;
+			desc[i].next = i+1;
+			i++;
+		}
 	}
+	BUG_ON(i != total_sg);
 
 	/* Last one doesn't continue. */
 	desc[i-1].flags &= ~VRING_DESC_F_NEXT;
@@ -155,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
 	return head;
 }
 
-/**
- * virtqueue_add_buf - expose buffer to other end
- * @vq: the struct virtqueue we're talking about.
- * @sg: the description of the buffer(s).
- * @out_num: the number of sg readable by other side
- * @in_num: the number of sg which are writable (after readable ones)
- * @data: the token identifying the buffer.
- * @gfp: how to do memory allocations (if necessary).
- *
- * Caller must ensure we don't call this with other virtqueue operations
- * at the same time (except where noted).
- *
- * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
- */
-int virtqueue_add_buf(struct virtqueue *_vq,
-		      struct scatterlist sg[],
-		      unsigned int out,
-		      unsigned int in,
-		      void *data,
-		      gfp_t gfp)
+static inline int virtqueue_add(struct virtqueue *_vq,
+				struct scatterlist *sgs[],
+				struct scatterlist *(*next)
+				  (struct scatterlist *, unsigned int *),
+				unsigned int total_out,
+				unsigned int total_in,
+				unsigned int out_sgs,
+				unsigned int in_sgs,
+				void *data,
+				gfp_t gfp)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
-	unsigned int i, avail, uninitialized_var(prev);
+	struct scatterlist *sg;
+	unsigned int i, n, avail, uninitialized_var(prev), total_sg;
 	int head;
 
 	START_USE(vq);
@@ -197,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq,
 	}
 #endif
 
+	total_sg = total_in + total_out;
+
 	/* If the host supports indirect descriptor tables, and we have multiple
 	 * buffers, then go indirect. FIXME: tune this threshold */
-	if (vq->indirect && (out + in) > 1 && vq->vq.num_free) {
-		head = vring_add_indirect(vq, sg, out, in, gfp);
+	if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
+		head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
+					  total_in,
+					  out_sgs, in_sgs, gfp);
 		if (likely(head >= 0))
 			goto add_head;
 	}
 
-	BUG_ON(out + in > vq->vring.num);
-	BUG_ON(out + in == 0);
+	BUG_ON(total_sg > vq->vring.num);
+	BUG_ON(total_sg == 0);
 
-	if (vq->vq.num_free < out + in) {
+	if (vq->vq.num_free < total_sg) {
 		pr_debug("Can't add buf len %i - avail = %i\n",
-			 out + in, vq->vq.num_free);
+			 total_sg, vq->vq.num_free);
 		/* FIXME: for historical reasons, we force a notify here if
 		 * there are outgoing parts to the buffer.  Presumably the
 		 * host should service the ring ASAP. */
-		if (out)
+		if (out_sgs)
 			vq->notify(&vq->vq);
 		END_USE(vq);
 		return -ENOSPC;
 	}
 
 	/* We're about to use some buffers from the free list. */
-	vq->vq.num_free -= out + in;
-
-	head = vq->free_head;
-	for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
-		vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
-		vq->vring.desc[i].addr = sg_phys(sg);
-		vq->vring.desc[i].len = sg->length;
-		prev = i;
-		sg++;
+	vq->vq.num_free -= total_sg;
+
+	head = i = vq->free_head;
+	for (n = 0; n < out_sgs; n++) {
+		for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+			vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
+			vq->vring.desc[i].addr = sg_phys(sg);
+			vq->vring.desc[i].len = sg->length;
+			prev = i;
+			i = vq->vring.desc[i].next;
+		}
 	}
-	for (; in; i = vq->vring.desc[i].next, in--) {
-		vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
-		vq->vring.desc[i].addr = sg_phys(sg);
-		vq->vring.desc[i].len = sg->length;
-		prev = i;
-		sg++;
+	for (; n < (out_sgs + in_sgs); n++) {
+		for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+			vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+			vq->vring.desc[i].addr = sg_phys(sg);
+			vq->vring.desc[i].len = sg->length;
+			prev = i;
+			i = vq->vring.desc[i].next;
+		}
 	}
 	/* Last one doesn't continue. */
 	vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
@@ -269,8 +294,77 @@ add_head:
 
 	return 0;
 }
+
+/**
+ * virtqueue_add_buf - expose buffer to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: the description of the buffer(s).
+ * @out: the number of sg readable by other side
+ * @in: the number of sg which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_buf(struct virtqueue *_vq,
+		      struct scatterlist sg[],
+		      unsigned int out,
+		      unsigned int in,
+		      void *data,
+		      gfp_t gfp)
+{
+	struct scatterlist *sgs[2];
+
+	sgs[0] = sg;
+	sgs[1] = sg + out;
+
+	return virtqueue_add(_vq, sgs, sg_next_arr,
+			     out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
+}
 EXPORT_SYMBOL_GPL(virtqueue_add_buf);
 
+/**
+ * virtqueue_add_sgs - expose buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sgs: array of terminated scatterlists.
+ * @out_sgs: the number of scatterlists readable by other side
+ * @in_sgs: the number of scatterlists which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_sgs(struct virtqueue *_vq,
+		      struct scatterlist *sgs[],
+		      unsigned int out_sgs,
+		      unsigned int in_sgs,
+		      void *data,
+		      gfp_t gfp)
+{
+	unsigned int i, total_out, total_in;
+
+	/* Count them first. */
+	for (i = total_out = total_in = 0; i < out_sgs; i++) {
+		struct scatterlist *sg;
+		for (sg = sgs[i]; sg; sg = sg_next(sg))
+			total_out++;
+	}
+	for (; i < out_sgs + in_sgs; i++) {
+		struct scatterlist *sg;
+		for (sg = sgs[i]; sg; sg = sg_next(sg))
+			total_in++;
+	}
+	return virtqueue_add(_vq, sgs, sg_next_chained,
+			     total_out, total_in, out_sgs, in_sgs, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
+
 /**
  * virtqueue_kick_prepare - first half of split virtqueue_kick call.
  * @vq: the struct virtqueue
-- 
cgit v1.2.3


From 282edb36499042a92b71f052f51754ae7ed936e4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:26 +1030
Subject: virtio_ring: virtqueue_add_outbuf / virtqueue_add_inbuf.

These are specialized versions of virtqueue_add_buf(), which cover
over 80% of cases and are far clearer.

In particular, the scatterlists passed to these functions don't have
to be clean (ie. we ignore end markers).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'drivers')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a78ad459cc85..5217baf5528c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -365,6 +365,50 @@ int virtqueue_add_sgs(struct virtqueue *_vq,
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
 
+/**
+ * virtqueue_add_outbuf - expose output buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: array of scatterlist entries (need not be terminated!)
+ * @num: the number of entries in @sg readable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_outbuf(struct virtqueue *vq,
+			 struct scatterlist sg[], unsigned int num,
+			 void *data,
+			 gfp_t gfp)
+{
+	return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
+
+/**
+ * virtqueue_add_inbuf - expose input buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: array of scatterlist entries (need not be terminated!)
+ * @num: the number of entries in @sg writable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_inbuf(struct virtqueue *vq,
+			struct scatterlist sg[], unsigned int num,
+			void *data,
+			gfp_t gfp)
+{
+	return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
+
 /**
  * virtqueue_kick_prepare - first half of split virtqueue_kick call.
  * @vq: the struct virtqueue
-- 
cgit v1.2.3


From 5ee21a52c05b5670ceeaa502c15cf306e379f714 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Mar 2013 15:44:27 +1030
Subject: virtio-blk: reorganize virtblk_add_req

Right now, both virtblk_add_req and virtblk_add_buf_wait call
virtqueue_add_buf.  To prepare for the next patches, abstract the call
to virtqueue_add_buf into a new function __virtblk_add_req, and include
the waiting logic directly in virtblk_add_req.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/block/virtio_blk.c | 55 +++++++++++++++++-----------------------------
 1 file changed, 20 insertions(+), 35 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 922bcb97e23a..b271650032fa 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -100,50 +100,39 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
 	return vbr;
 }
 
-static void virtblk_add_buf_wait(struct virtio_blk *vblk,
-				 struct virtblk_req *vbr,
-				 unsigned long out,
-				 unsigned long in)
+static inline int __virtblk_add_req(struct virtqueue *vq,
+			     struct virtblk_req *vbr,
+			     unsigned long out,
+			     unsigned long in)
 {
+	return virtqueue_add_buf(vq, vbr->sg, out, in, vbr, GFP_ATOMIC);
+}
+
+static void virtblk_add_req(struct virtblk_req *vbr,
+			    unsigned int out, unsigned int in)
+{
+	struct virtio_blk *vblk = vbr->vblk;
 	DEFINE_WAIT(wait);
+	int ret;
 
-	for (;;) {
+	spin_lock_irq(vblk->disk->queue->queue_lock);
+	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr,
+						 out, in)) < 0)) {
 		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
 					  TASK_UNINTERRUPTIBLE);
 
+		spin_unlock_irq(vblk->disk->queue->queue_lock);
+		io_schedule();
 		spin_lock_irq(vblk->disk->queue->queue_lock);
-		if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
-				      GFP_ATOMIC) < 0) {
-			spin_unlock_irq(vblk->disk->queue->queue_lock);
-			io_schedule();
-		} else {
-			virtqueue_kick(vblk->vq);
-			spin_unlock_irq(vblk->disk->queue->queue_lock);
-			break;
-		}
 
+		finish_wait(&vblk->queue_wait, &wait);
 	}
 
-	finish_wait(&vblk->queue_wait, &wait);
-}
-
-static inline void virtblk_add_req(struct virtblk_req *vbr,
-				   unsigned int out, unsigned int in)
-{
-	struct virtio_blk *vblk = vbr->vblk;
-
-	spin_lock_irq(vblk->disk->queue->queue_lock);
-	if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
-					GFP_ATOMIC) < 0)) {
-		spin_unlock_irq(vblk->disk->queue->queue_lock);
-		virtblk_add_buf_wait(vblk, vbr, out, in);
-		return;
-	}
 	virtqueue_kick(vblk->vq);
 	spin_unlock_irq(vblk->disk->queue->queue_lock);
 }
 
-static int virtblk_bio_send_flush(struct virtblk_req *vbr)
+static void virtblk_bio_send_flush(struct virtblk_req *vbr)
 {
 	unsigned int out = 0, in = 0;
 
@@ -155,11 +144,9 @@ static int virtblk_bio_send_flush(struct virtblk_req *vbr)
 	sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
 
 	virtblk_add_req(vbr, out, in);
-
-	return 0;
 }
 
-static int virtblk_bio_send_data(struct virtblk_req *vbr)
+static void virtblk_bio_send_data(struct virtblk_req *vbr)
 {
 	struct virtio_blk *vblk = vbr->vblk;
 	unsigned int num, out = 0, in = 0;
@@ -188,8 +175,6 @@ static int virtblk_bio_send_data(struct virtblk_req *vbr)
 	}
 
 	virtblk_add_req(vbr, out, in);
-
-	return 0;
 }
 
 static void virtblk_bio_send_data_work(struct work_struct *work)
-- 
cgit v1.2.3


From 8f39db9d3709afe944710f124111ec87467d25c7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Mar 2013 15:44:27 +1030
Subject: virtio-blk: use virtqueue_add_sgs on bio path

(This is a respin of Paolo Bonzini's patch, but it calls
virtqueue_add_sgs() instead of his multi-part API).

Move the creation of the request header and response footer to
__virtblk_add_req.  vbr->sg only contains the data scatterlist,
the header/footer are added separately using virtqueue_add_sgs().

With this change, virtio-blk (with use_bio) is not relying anymore on
the virtio functions ignoring the end markers in a scatterlist.
The next patch will do the same for the other path.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Asias He <asias@redhat.com>
---
 drivers/block/virtio_blk.c | 58 +++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index b271650032fa..cfbe39d35277 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -62,6 +62,7 @@ struct virtblk_req
 	struct virtio_blk *vblk;
 	int flags;
 	u8 status;
+	int nents;
 	struct scatterlist sg[];
 };
 
@@ -100,24 +101,36 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
 	return vbr;
 }
 
-static inline int __virtblk_add_req(struct virtqueue *vq,
-			     struct virtblk_req *vbr,
-			     unsigned long out,
-			     unsigned long in)
+static int __virtblk_add_req(struct virtqueue *vq,
+			     struct virtblk_req *vbr)
 {
-	return virtqueue_add_buf(vq, vbr->sg, out, in, vbr, GFP_ATOMIC);
+	struct scatterlist hdr, status, *sgs[3];
+	unsigned int num_out = 0, num_in = 0;
+
+	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+	sgs[num_out++] = &hdr;
+
+	if (vbr->nents) {
+		if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+			sgs[num_out++] = vbr->sg;
+		else
+			sgs[num_out + num_in++] = vbr->sg;
+	}
+
+	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+	sgs[num_out + num_in++] = &status;
+
+	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
-static void virtblk_add_req(struct virtblk_req *vbr,
-			    unsigned int out, unsigned int in)
+static void virtblk_add_req(struct virtblk_req *vbr)
 {
 	struct virtio_blk *vblk = vbr->vblk;
 	DEFINE_WAIT(wait);
 	int ret;
 
 	spin_lock_irq(vblk->disk->queue->queue_lock);
-	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr,
-						 out, in)) < 0)) {
+	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr)) < 0)) {
 		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
 					  TASK_UNINTERRUPTIBLE);
 
@@ -134,22 +147,18 @@ static void virtblk_add_req(struct virtblk_req *vbr,
 
 static void virtblk_bio_send_flush(struct virtblk_req *vbr)
 {
-	unsigned int out = 0, in = 0;
-
 	vbr->flags |= VBLK_IS_FLUSH;
 	vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 	vbr->out_hdr.sector = 0;
 	vbr->out_hdr.ioprio = 0;
-	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-	sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
+	vbr->nents = 0;
 
-	virtblk_add_req(vbr, out, in);
+	virtblk_add_req(vbr);
 }
 
 static void virtblk_bio_send_data(struct virtblk_req *vbr)
 {
 	struct virtio_blk *vblk = vbr->vblk;
-	unsigned int num, out = 0, in = 0;
 	struct bio *bio = vbr->bio;
 
 	vbr->flags &= ~VBLK_IS_FLUSH;
@@ -157,24 +166,15 @@ static void virtblk_bio_send_data(struct virtblk_req *vbr)
 	vbr->out_hdr.sector = bio->bi_sector;
 	vbr->out_hdr.ioprio = bio_prio(bio);
 
-	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
-	num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
-
-	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
-		   sizeof(vbr->status));
-
-	if (num) {
-		if (bio->bi_rw & REQ_WRITE) {
+	vbr->nents = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg);
+	if (vbr->nents) {
+		if (bio->bi_rw & REQ_WRITE)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-			out += num;
-		} else {
+		else
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-			in += num;
-		}
 	}
 
-	virtblk_add_req(vbr, out, in);
+	virtblk_add_req(vbr);
 }
 
 static void virtblk_bio_send_data_work(struct work_struct *work)
-- 
cgit v1.2.3


From 20af3cfd20145fece208ada7cb10e1fd7f21f128 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Mar 2013 15:44:27 +1030
Subject: virtio-blk: use virtqueue_add_sgs on req path

(This is a respin of Paolo Bonzini's patch, but it calls
virtqueue_add_sgs() instead of his multi-part API).

This is similar to the previous patch, but a bit more radical
because the bio and req paths now share the buffer construction
code.  Because the req path doesn't use vbr->sg, however, we
need to add a couple of arguments to __virtblk_add_req.

We also need to teach __virtblk_add_req how to build SCSI command
requests.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Asias He <asias@redhat.com>
---
 drivers/block/virtio_blk.c | 69 ++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 36 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cfbe39d35277..cc88b29c6393 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -102,19 +102,40 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
 }
 
 static int __virtblk_add_req(struct virtqueue *vq,
-			     struct virtblk_req *vbr)
+			     struct virtblk_req *vbr,
+			     struct scatterlist *data_sg,
+			     unsigned data_nents)
 {
-	struct scatterlist hdr, status, *sgs[3];
+	struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
 	unsigned int num_out = 0, num_in = 0;
+	int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
 
 	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
 	sgs[num_out++] = &hdr;
 
-	if (vbr->nents) {
+	/*
+	 * If this is a packet command we need a couple of additional headers.
+	 * Behind the normal outhdr we put a segment with the scsi command
+	 * block, and before the normal inhdr we put the sense data and the
+	 * inhdr with additional status information.
+	 */
+	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+		sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+		sgs[num_out++] = &cmd;
+	}
+
+	if (data_nents) {
 		if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
-			sgs[num_out++] = vbr->sg;
+			sgs[num_out++] = data_sg;
 		else
-			sgs[num_out + num_in++] = vbr->sg;
+			sgs[num_out + num_in++] = data_sg;
+	}
+
+	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+		sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+		sgs[num_out + num_in++] = &sense;
+		sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+		sgs[num_out + num_in++] = &inhdr;
 	}
 
 	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
@@ -130,7 +151,8 @@ static void virtblk_add_req(struct virtblk_req *vbr)
 	int ret;
 
 	spin_lock_irq(vblk->disk->queue->queue_lock);
-	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr)) < 0)) {
+	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
+						 vbr->nents)) < 0)) {
 		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
 					  TASK_UNINTERRUPTIBLE);
 
@@ -283,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		   struct request *req)
 {
-	unsigned long num, out = 0, in = 0;
+	unsigned int num;
 	struct virtblk_req *vbr;
 
 	vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -320,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
-	/*
-	 * If this is a packet command we need a couple of additional headers.
-	 * Behind the normal outhdr we put a segment with the scsi command
-	 * block, and before the normal inhdr we put the sense data and the
-	 * inhdr with additional status information before the normal inhdr.
-	 */
-	if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
-		sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
-
-	num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
-
-	if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
-		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
-		sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
-			   sizeof(vbr->in_hdr));
-	}
-
-	sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
-		   sizeof(vbr->status));
-
+	num = blk_rq_map_sg(q, vbr->req, vblk->sg);
 	if (num) {
-		if (rq_data_dir(vbr->req) == WRITE) {
+		if (rq_data_dir(vbr->req) == WRITE)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-			out += num;
-		} else {
+		else
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-			in += num;
-		}
 	}
 
-	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
-			      GFP_ATOMIC) < 0) {
+	if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
-- 
cgit v1.2.3


From 0a11cc36f7b33fa2de0ad95199d2f2ab896fbd93 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:27 +1030
Subject: virtio_blk: remove nents member.

It's simply a flag as to whether we have data now, so make it an
explicit function parameter rather than a member of struct
virtblk_req.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Asias He <asias@redhat.com>
---
 drivers/block/virtio_blk.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cc88b29c6393..64723953e1c9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -62,7 +62,6 @@ struct virtblk_req
 	struct virtio_blk *vblk;
 	int flags;
 	u8 status;
-	int nents;
 	struct scatterlist sg[];
 };
 
@@ -104,7 +103,7 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
 static int __virtblk_add_req(struct virtqueue *vq,
 			     struct virtblk_req *vbr,
 			     struct scatterlist *data_sg,
-			     unsigned data_nents)
+			     bool have_data)
 {
 	struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
 	unsigned int num_out = 0, num_in = 0;
@@ -124,7 +123,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
 		sgs[num_out++] = &cmd;
 	}
 
-	if (data_nents) {
+	if (have_data) {
 		if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
 			sgs[num_out++] = data_sg;
 		else
@@ -144,7 +143,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
-static void virtblk_add_req(struct virtblk_req *vbr)
+static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
 {
 	struct virtio_blk *vblk = vbr->vblk;
 	DEFINE_WAIT(wait);
@@ -152,7 +151,7 @@ static void virtblk_add_req(struct virtblk_req *vbr)
 
 	spin_lock_irq(vblk->disk->queue->queue_lock);
 	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
-						 vbr->nents)) < 0)) {
+						 have_data)) < 0)) {
 		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
 					  TASK_UNINTERRUPTIBLE);
 
@@ -173,30 +172,31 @@ static void virtblk_bio_send_flush(struct virtblk_req *vbr)
 	vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 	vbr->out_hdr.sector = 0;
 	vbr->out_hdr.ioprio = 0;
-	vbr->nents = 0;
 
-	virtblk_add_req(vbr);
+	virtblk_add_req(vbr, false);
 }
 
 static void virtblk_bio_send_data(struct virtblk_req *vbr)
 {
 	struct virtio_blk *vblk = vbr->vblk;
 	struct bio *bio = vbr->bio;
+	bool have_data;
 
 	vbr->flags &= ~VBLK_IS_FLUSH;
 	vbr->out_hdr.type = 0;
 	vbr->out_hdr.sector = bio->bi_sector;
 	vbr->out_hdr.ioprio = bio_prio(bio);
 
-	vbr->nents = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg);
-	if (vbr->nents) {
+	if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
+		have_data = true;
 		if (bio->bi_rw & REQ_WRITE)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
 		else
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-	}
+	} else
+		have_data = false;
 
-	virtblk_add_req(vbr);
+	virtblk_add_req(vbr, have_data);
 }
 
 static void virtblk_bio_send_data_work(struct work_struct *work)
-- 
cgit v1.2.3


From 682993b4e445bdfe9935d5e6e298565b7e11d7ee Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Wed, 20 Mar 2013 15:44:28 +1030
Subject: virtio-scsi: use virtqueue_add_sgs for command buffers

Using the new virtqueue_add_sgs function lets us simplify the queueing
path.  In particular, all data protected by the tgt_lock is just gone
(multiqueue will find a new use for the lock).

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/scsi/virtio_scsi.c | 100 +++++++++++++++++----------------------------
 1 file changed, 37 insertions(+), 63 deletions(-)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 0f5dd2804ae5..77206d0eb6a9 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -61,11 +61,8 @@ struct virtio_scsi_vq {
 
 /* Per-target queue state */
 struct virtio_scsi_target_state {
-	/* Protects sg.  Lock hierarchy is tgt_lock -> vq_lock.  */
+	/* Never held at the same time as vq_lock.  */
 	spinlock_t tgt_lock;
-
-	/* For sglist construction when adding commands to the virtqueue.  */
-	struct scatterlist sg[];
 };
 
 /* Driver instance state */
@@ -353,75 +350,61 @@ static void virtscsi_event_done(struct virtqueue *vq)
 	spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
 };
 
-static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
-			     struct scsi_data_buffer *sdb)
-{
-	struct sg_table *table = &sdb->table;
-	struct scatterlist *sg_elem;
-	unsigned int idx = *p_idx;
-	int i;
-
-	for_each_sg(table->sgl, sg_elem, table->nents, i)
-		sg[idx++] = *sg_elem;
-
-	*p_idx = idx;
-}
-
 /**
- * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist
- * @vscsi	: virtio_scsi state
+ * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue
+ * @vq		: the struct virtqueue we're talking about
  * @cmd		: command structure
- * @out_num	: number of read-only elements
- * @in_num	: number of write-only elements
  * @req_size	: size of the request buffer
  * @resp_size	: size of the response buffer
- *
- * Called with tgt_lock held.
+ * @gfp	: flags to use for memory allocations
  */
-static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt,
-			     struct virtio_scsi_cmd *cmd,
-			     unsigned *out_num, unsigned *in_num,
-			     size_t req_size, size_t resp_size)
+static int virtscsi_add_cmd(struct virtqueue *vq,
+			    struct virtio_scsi_cmd *cmd,
+			    size_t req_size, size_t resp_size, gfp_t gfp)
 {
 	struct scsi_cmnd *sc = cmd->sc;
-	struct scatterlist *sg = tgt->sg;
-	unsigned int idx = 0;
+	struct scatterlist *sgs[4], req, resp;
+	struct sg_table *out, *in;
+	unsigned out_num = 0, in_num = 0;
+
+	out = in = NULL;
+
+	if (sc && sc->sc_data_direction != DMA_NONE) {
+		if (sc->sc_data_direction != DMA_FROM_DEVICE)
+			out = &scsi_out(sc)->table;
+		if (sc->sc_data_direction != DMA_TO_DEVICE)
+			in = &scsi_in(sc)->table;
+	}
 
 	/* Request header.  */
-	sg_set_buf(&sg[idx++], &cmd->req, req_size);
+	sg_init_one(&req, &cmd->req, req_size);
+	sgs[out_num++] = &req;
 
 	/* Data-out buffer.  */
-	if (sc && sc->sc_data_direction != DMA_FROM_DEVICE)
-		virtscsi_map_sgl(sg, &idx, scsi_out(sc));
-
-	*out_num = idx;
+	if (out)
+		sgs[out_num++] = out->sgl;
 
 	/* Response header.  */
-	sg_set_buf(&sg[idx++], &cmd->resp, resp_size);
+	sg_init_one(&resp, &cmd->resp, resp_size);
+	sgs[out_num + in_num++] = &resp;
 
 	/* Data-in buffer */
-	if (sc && sc->sc_data_direction != DMA_TO_DEVICE)
-		virtscsi_map_sgl(sg, &idx, scsi_in(sc));
+	if (in)
+		sgs[out_num + in_num++] = in->sgl;
 
-	*in_num = idx - *out_num;
+	return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp);
 }
 
-static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
-			     struct virtio_scsi_vq *vq,
+static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
 			     struct virtio_scsi_cmd *cmd,
 			     size_t req_size, size_t resp_size, gfp_t gfp)
 {
-	unsigned int out_num, in_num;
 	unsigned long flags;
 	int err;
 	bool needs_kick = false;
 
-	spin_lock_irqsave(&tgt->tgt_lock, flags);
-	virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size);
-
-	spin_lock(&vq->vq_lock);
-	err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
-	spin_unlock(&tgt->tgt_lock);
+	spin_lock_irqsave(&vq->vq_lock, flags);
+	err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp);
 	if (!err)
 		needs_kick = virtqueue_kick_prepare(vq->vq);
 
@@ -435,7 +418,6 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
 static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
 {
 	struct virtio_scsi *vscsi = shost_priv(sh);
-	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
 	struct virtio_scsi_cmd *cmd;
 	int ret;
 
@@ -469,7 +451,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
 	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
 	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
 
-	if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd,
+	if (virtscsi_kick_cmd(&vscsi->req_vq, cmd,
 			      sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
 			      GFP_ATOMIC) == 0)
 		ret = 0;
@@ -483,11 +465,10 @@ out:
 static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
 {
 	DECLARE_COMPLETION_ONSTACK(comp);
-	struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id];
 	int ret = FAILED;
 
 	cmd->comp = &comp;
-	if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd,
+	if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd,
 			      sizeof cmd->req.tmf, sizeof cmd->resp.tmf,
 			      GFP_NOIO) < 0)
 		goto out;
@@ -588,20 +569,16 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 }
 
 static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
-	struct virtio_device *vdev, int sg_elems)
+	struct virtio_device *vdev)
 {
 	struct virtio_scsi_target_state *tgt;
 	gfp_t gfp_mask = GFP_KERNEL;
 
-	/* We need extra sg elements at head and tail.  */
-	tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2),
-		      gfp_mask);
-
+	tgt = kmalloc(sizeof(*tgt), gfp_mask);
 	if (!tgt)
 		return NULL;
 
 	spin_lock_init(&tgt->tgt_lock);
-	sg_init_table(tgt->sg, sg_elems + 2);
 	return tgt;
 }
 
@@ -635,7 +612,7 @@ static int virtscsi_init(struct virtio_device *vdev,
 {
 	int err;
 	struct virtqueue *vqs[3];
-	u32 i, sg_elems;
+	u32 i;
 
 	vq_callback_t *callbacks[] = {
 		virtscsi_ctrl_done,
@@ -663,11 +640,8 @@ static int virtscsi_init(struct virtio_device *vdev,
 	if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
 		virtscsi_kick_event_all(vscsi);
 
-	/* We need to know how many segments before we allocate.  */
-	sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
-
 	for (i = 0; i < num_targets; i++) {
-		vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
+		vscsi->tgt[i] = virtscsi_alloc_tgt(vdev);
 		if (!vscsi->tgt[i]) {
 			err = -ENOMEM;
 			goto out;
-- 
cgit v1.2.3


From bf9582910b26525d4eeaa9840b07e7bf820f04fb Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:28 +1030
Subject: virtio_scsi: use virtqueue_add_inbuf() for virtscsi_kick_event.

It's a bit clearer, and add_buf is going away.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Asias He <asias@redhat.com>
---
 drivers/scsi/virtio_scsi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 77206d0eb6a9..b53ba9e18f47 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -222,8 +222,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
 
 	spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
 
-	err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node,
-				GFP_ATOMIC);
+	err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
+				  GFP_ATOMIC);
 	if (!err)
 		virtqueue_kick(vscsi->event_vq.vq);
 
-- 
cgit v1.2.3


From f7bc9594513d8f5a6e88e1486d48687ce5831834 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:28 +1030
Subject: virtio_net: use virtqueue_add_sgs[] for command buffers.

It's a bit cleaner to hand over multiple sgs, rather than one big one.

Cc: "Michael S. Tsirkin" <mst@redhat.com>
Tested-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/virtio_net.c | 51 ++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 25 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 57ac4b0294bc..be704876af8a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,7 +39,6 @@ module_param(gso, bool, 0444);
 #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
 
-#define VIRTNET_SEND_COMMAND_SG_MAX    2
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
 struct virtnet_stats {
@@ -767,32 +766,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
  * never fail unless improperly formated.
  */
 static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
-				 struct scatterlist *data, int out, int in)
+				 struct scatterlist *out,
+				 struct scatterlist *in)
 {
-	struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
+	struct scatterlist *sgs[4], hdr, stat;
 	struct virtio_net_ctrl_hdr ctrl;
 	virtio_net_ctrl_ack status = ~0;
-	unsigned int tmp;
-	int i;
+	unsigned out_num = 0, in_num = 0, tmp;
 
 	/* Caller should know better */
-	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ||
-		(out + in > VIRTNET_SEND_COMMAND_SG_MAX));
-
-	out++; /* Add header */
-	in++; /* Add return status */
+	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
 
 	ctrl.class = class;
 	ctrl.cmd = cmd;
+	/* Add header */
+	sg_init_one(&hdr, &ctrl, sizeof(ctrl));
+	sgs[out_num++] = &hdr;
 
-	sg_init_table(sg, out + in);
+	if (out)
+		sgs[out_num++] = out;
+	if (in)
+		sgs[out_num + in_num++] = in;
 
-	sg_set_buf(&sg[0], &ctrl, sizeof(ctrl));
-	for_each_sg(data, s, out + in - 2, i)
-		sg_set_buf(&sg[i + 1], sg_virt(s), s->length);
-	sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
+	/* Add return status. */
+	sg_init_one(&stat, &status, sizeof(status));
+	sgs[out_num + in_num++] = &stat;
 
-	BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0);
+	BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
+	BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
+	       < 0);
 
 	virtqueue_kick(vi->cvq);
 
@@ -821,7 +823,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
 		sg_init_one(&sg, addr->sa_data, dev->addr_len);
 		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
 					  VIRTIO_NET_CTRL_MAC_ADDR_SET,
-					  &sg, 1, 0)) {
+					  &sg, NULL)) {
 			dev_warn(&vdev->dev,
 				 "Failed to set mac address by vq command.\n");
 			return -EINVAL;
@@ -889,8 +891,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
 {
 	rtnl_lock();
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
-				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL,
-				  0, 0))
+				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL))
 		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
 	rtnl_unlock();
 }
@@ -908,7 +909,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 	sg_init_one(&sg, &s, sizeof(s));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
-				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){
+				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) {
 		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
 			 queue_pairs);
 		return -EINVAL;
@@ -955,7 +956,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_PROMISC,
-				  sg, 1, 0))
+				  sg, NULL))
 		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
 			 promisc ? "en" : "dis");
 
@@ -963,7 +964,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
 				  VIRTIO_NET_CTRL_RX_ALLMULTI,
-				  sg, 1, 0))
+				  sg, NULL))
 		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
 			 allmulti ? "en" : "dis");
 
@@ -1000,7 +1001,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
 				  VIRTIO_NET_CTRL_MAC_TABLE_SET,
-				  sg, 2, 0))
+				  sg, NULL))
 		dev_warn(&dev->dev, "Failed to set MAC fitler table.\n");
 
 	kfree(buf);
@@ -1014,7 +1015,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev, u16 vid)
 	sg_init_one(&sg, &vid, sizeof(vid));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
-				  VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0))
+				  VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL))
 		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
 	return 0;
 }
@@ -1027,7 +1028,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
 	sg_init_one(&sg, &vid, sizeof(vid));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
-				  VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0))
+				  VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL))
 		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
 	return 0;
 }
-- 
cgit v1.2.3


From 9dc7b9e4d0a6daac5b1f29a338911d63d34533cd Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:28 +1030
Subject: virtio_net: use simplified virtqueue accessors.

We never add buffers with both input and output parts, so use the new
accessors.

Cc: "Michael S. Tsirkin" <mst@redhat.com>
Reviewed-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/virtio_net.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index be704876af8a..d88d4366d9ac 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -443,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
 
 	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
 
-	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
 	if (err < 0)
 		dev_kfree_skb(skb);
 
@@ -488,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 
 	/* chain first in list head */
 	first->private = (unsigned long)list;
-	err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
-				first, gfp);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
+				  first, gfp);
 	if (err < 0)
 		give_pages(rq, first);
 
@@ -507,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 
 	sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
 
-	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
 	if (err < 0)
 		give_pages(rq, page);
 
@@ -710,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 		sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
 
 	num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
-	return virtqueue_add_buf(sq->vq, sq->sg, num_sg,
-				 0, skb, GFP_ATOMIC);
+	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
-- 
cgit v1.2.3


From fb6aa6fcfec29932122cb0fb2d5d1f7700a9883b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:29 +1030
Subject: virtio_rng: use simplified virtqueue accessors.

We never add buffers with both input and output parts, so use the new
accessors.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Asias He <asias@redhat.com>
---
 drivers/char/hw_random/virtio-rng.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 10fd71ccf587..842e2d55d335 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
 	sg_init_one(&sg, buf, size);
 
 	/* There should always be room for one buffer. */
-	if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
+	if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
 		BUG();
 
 	virtqueue_kick(vq);
-- 
cgit v1.2.3


From 6797999d99587e7b4189cf24c8f1053e02444703 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:29 +1030
Subject: virtio_console: use simplified virtqueue accessors.

We never add buffers with both input and output parts, so use the new
accessors.

Acked-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/char/virtio_console.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index e905d5f53051..6d59f166e0e9 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -502,7 +502,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
 
 	sg_init_one(sg, buf->buf, buf->size);
 
-	ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
+	ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
 	virtqueue_kick(vq);
 	if (!ret)
 		ret = vq->num_free;
@@ -569,7 +569,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
 	vq = portdev->c_ovq;
 
 	sg_init_one(sg, &cpkt, sizeof(cpkt));
-	if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
+	if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
 		virtqueue_kick(vq);
 		while (!virtqueue_get_buf(vq, &len))
 			cpu_relax();
@@ -618,7 +618,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
 
 	reclaim_consumed_buffers(port);
 
-	err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
 
 	/* Tell Host to go! */
 	virtqueue_kick(out_vq);
-- 
cgit v1.2.3


From 71bcbecc89a6b24f2c60d3e4271e76013fa46860 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:29 +1030
Subject: caif_virtio: use simplified virtqueue accessors.

We never add buffers with both input and output parts, so use the new
accessors.

Cc: Sjur Brendeland <sjur.brandeland@stericsson.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/caif/caif_virtio.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index b1e1205e4e28..f6caa1eb4cd6 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -572,8 +572,7 @@ static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
 		goto err;
 	}
 
-	ret = virtqueue_add_buf(cfv->vq_tx, &sg, 1, 0,
-				buf_info, GFP_ATOMIC);
+	ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC);
 	if (unlikely((ret < 0))) {
 		/* If flow control works, this shouldn't happen */
 		netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
-- 
cgit v1.2.3


From cee51d69a45b6ce202d1d17551165fb3c76dfdb9 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:29 +1030
Subject: virtio_rpmsg_bus: use simplified virtqueue accessors.

We never add buffers with both input and output parts, so use the new
accessors.

Cc: Ohad Ben-Cohen <ohad@wizery.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/rpmsg/virtio_rpmsg_bus.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index a59684b5fc68..33d827b30e95 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst,
 	mutex_lock(&vrp->tx_lock);
 
 	/* add message to the remote processor's virtqueue */
-	err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL);
+	err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL);
 	if (err) {
 		/*
 		 * need to reclaim the buffer here, otherwise it's lost
 		 * (memory won't leak, but rpmsg won't use it again for TX).
 		 * this will wait for a buffer management overhaul.
 		 */
-		dev_err(dev, "virtqueue_add_buf failed: %d\n", err);
+		dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err);
 		goto out;
 	}
 
@@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
 	sg_init_one(&sg, msg, RPMSG_BUF_SIZE);
 
 	/* add the buffer back to the remote processor's virtqueue */
-	err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL);
+	err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
 	if (err < 0) {
 		dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
 		return;
@@ -970,7 +970,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
 
 		sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE);
 
-		err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr,
+		err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
 								GFP_KERNEL);
 		WARN_ON(err); /* sanity check; this can't really happen */
 	}
-- 
cgit v1.2.3


From 92549abc6a6573294fc1bb9330db8b52dedfea5f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 20 Mar 2013 15:44:30 +1030
Subject: virtio_balloon: use simplified virtqueue accessors.

We never add buffers with both input and output parts, so use the new
accessors.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_balloon.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8dab163c5ef0..bd3ae324a1a2 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 	sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
 
 	/* We should always be able to add one buffer to an empty queue. */
-	if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
+	if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
 		BUG();
 	virtqueue_kick(vq);
 
@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
 	if (!virtqueue_get_buf(vq, &len))
 		return;
 	sg_init_one(&sg, vb->stats, sizeof(vb->stats));
-	if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
+	if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
 		BUG();
 	virtqueue_kick(vq);
 }
@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb)
 		 * use it to signal us later.
 		 */
 		sg_init_one(&sg, vb->stats, sizeof vb->stats);
-		if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL)
+		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
 		    < 0)
 			BUG();
 		virtqueue_kick(vb->stats_vq);
-- 
cgit v1.2.3


From b2273be8d2df7b77165a70930064aeb9e8faebfa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= <sjur.brandeland@stericsson.com>
Date: Sun, 24 Mar 2013 14:19:44 +1030
Subject: caif_virtio: Use vringh_notify_enable correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Check for the correct return value from
vringh_notify_enable_kern(): it returns false, not true,
when more packets are available.

Signed-off-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/caif/caif_virtio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index f6caa1eb4cd6..fb80765e258e 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -318,7 +318,7 @@ exit:
 
 		/* Really out of patckets? (stolen from virtio_net)*/
 		napi_complete(napi);
-		if (unlikely(vringh_notify_enable_kern(cfv->vr_rx)) &&
+		if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) &&
 		    napi_schedule_prep(napi)) {
 			vringh_notify_disable_kern(cfv->vr_rx);
 			__napi_schedule(napi);
-- 
cgit v1.2.3


From a8c7687bf21603af6246e55cc58f98e42241bd01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= <sjur.brandeland@stericsson.com>
Date: Sun, 24 Mar 2013 14:19:59 +1030
Subject: caif_virtio: Check that vringh_config is not null
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Check that vringh_config is not NULL before using it.

Signed-off-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/caif/caif_virtio.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers')

diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index fb80765e258e..316b184ea130 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -670,6 +670,10 @@ static int cfv_probe(struct virtio_device *vdev)
 	spin_lock_init(&cfv->tx_lock);
 
 	/* Get the RX virtio ring. This is a "host side vring". */
+	err = -ENODEV;
+	if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
+		goto err;
+
 	err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
 	if (err)
 		goto err;
-- 
cgit v1.2.3


From 1aef76e9c4c616c91233ece9850e89c91f3fd92a Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Tue, 2 Apr 2013 16:45:56 +1030
Subject: caif_virtio: fix error return code in cfv_create_genpool()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Return a negative error code from the error handling case
instead of 0, as is done elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Acked-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/caif/caif_virtio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index 316b184ea130..0e3bede8b8a8 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -424,8 +424,10 @@ static int cfv_create_genpool(struct cfv_info *cfv)
 	cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
 	cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
 					   cfv->reserved_size);
-	if (!cfv->reserved_mem)
+	if (!cfv->reserved_mem) {
+		err = -ENOMEM;
 		goto err;
+	}
 
 	cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
 	return 0;
-- 
cgit v1.2.3


From 3826835ab8bb7eac47f14f279df2bd58ec2bb279 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 8 Apr 2013 16:13:59 +0930
Subject: virtio_console: make local symbols static

Those symbols are only used within this file, and should be static.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Acked-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/char/virtio_console.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 6d59f166e0e9..f4f31fe88902 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -78,8 +78,8 @@ struct ports_driver_data {
 };
 static struct ports_driver_data pdrvdata;
 
-DEFINE_SPINLOCK(pdrvdata_lock);
-DECLARE_COMPLETION(early_console_added);
+static DEFINE_SPINLOCK(pdrvdata_lock);
+static DECLARE_COMPLETION(early_console_added);
 
 /* This struct holds information that's relevant only for console ports */
 struct console {
@@ -1198,7 +1198,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
 	return hvc_instantiate(0, 0, &hv_ops);
 }
 
-int init_port_console(struct port *port)
+static int init_port_console(struct port *port)
 {
 	int ret;
 
-- 
cgit v1.2.3


From 5c370194df9a97248ca69e05bfbd2d21b4886fe5 Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Mon, 8 Apr 2013 23:01:16 +0930
Subject: virtio-scsi: redo allocation of target data

virtio_scsi_target_state is now empty.  We will find new uses for it in
the next few patches, so this patch does not drop it completely.

As James suggested, we use the target_alloc and target_destroy entries
in the host template to allocate and destroy the virtio_scsi_target_state
of each target, and attach this struct to scsi_target->hostdata.  Now
we can get at it from the sdev with scsi_target(sdev)->hostdata.
There is no more messing around with fixed-size arrays and bulk memory
allocation, and no need to pass in the maximum target size as a
parameter, because everything should now happen dynamically.

Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Reviewed-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/scsi/virtio_scsi.c | 71 ++++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 46 deletions(-)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index b53ba9e18f47..ffa03e8cb19c 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -75,8 +75,6 @@ struct virtio_scsi {
 
 	/* Get some buffers ready for event vq */
 	struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
-
-	struct virtio_scsi_target_state *tgt[];
 };
 
 static struct kmem_cache *virtscsi_cmd_cache;
@@ -530,6 +528,25 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
 	return virtscsi_tmf(vscsi, cmd);
 }
 
+static int virtscsi_target_alloc(struct scsi_target *starget)
+{
+	struct virtio_scsi_target_state *tgt =
+				kmalloc(sizeof(*tgt), GFP_KERNEL);
+	if (!tgt)
+		return -ENOMEM;
+
+	spin_lock_init(&tgt->tgt_lock);
+
+	starget->hostdata = tgt;
+	return 0;
+}
+
+static void virtscsi_target_destroy(struct scsi_target *starget)
+{
+	struct virtio_scsi_target_state *tgt = starget->hostdata;
+	kfree(tgt);
+}
+
 static struct scsi_host_template virtscsi_host_template = {
 	.module = THIS_MODULE,
 	.name = "Virtio SCSI HBA",
@@ -542,6 +559,8 @@ static struct scsi_host_template virtscsi_host_template = {
 	.can_queue = 1024,
 	.dma_boundary = UINT_MAX,
 	.use_clustering = ENABLE_CLUSTERING,
+	.target_alloc = virtscsi_target_alloc,
+	.target_destroy = virtscsi_target_destroy,
 };
 
 #define virtscsi_config_get(vdev, fld) \
@@ -568,20 +587,6 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 	virtscsi_vq->vq = vq;
 }
 
-static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
-	struct virtio_device *vdev)
-{
-	struct virtio_scsi_target_state *tgt;
-	gfp_t gfp_mask = GFP_KERNEL;
-
-	tgt = kmalloc(sizeof(*tgt), gfp_mask);
-	if (!tgt)
-		return NULL;
-
-	spin_lock_init(&tgt->tgt_lock);
-	return tgt;
-}
-
 static void virtscsi_scan(struct virtio_device *vdev)
 {
 	struct Scsi_Host *shost = (struct Scsi_Host *)vdev->priv;
@@ -591,28 +596,17 @@ static void virtscsi_scan(struct virtio_device *vdev)
 
 static void virtscsi_remove_vqs(struct virtio_device *vdev)
 {
-	struct Scsi_Host *sh = virtio_scsi_host(vdev);
-	struct virtio_scsi *vscsi = shost_priv(sh);
-	u32 i, num_targets;
-
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
-	num_targets = sh->max_id;
-	for (i = 0; i < num_targets; i++) {
-		kfree(vscsi->tgt[i]);
-		vscsi->tgt[i] = NULL;
-	}
-
 	vdev->config->del_vqs(vdev);
 }
 
 static int virtscsi_init(struct virtio_device *vdev,
-			 struct virtio_scsi *vscsi, int num_targets)
+			 struct virtio_scsi *vscsi)
 {
 	int err;
 	struct virtqueue *vqs[3];
-	u32 i;
 
 	vq_callback_t *callbacks[] = {
 		virtscsi_ctrl_done,
@@ -640,18 +634,6 @@ static int virtscsi_init(struct virtio_device *vdev,
 	if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
 		virtscsi_kick_event_all(vscsi);
 
-	for (i = 0; i < num_targets; i++) {
-		vscsi->tgt[i] = virtscsi_alloc_tgt(vdev);
-		if (!vscsi->tgt[i]) {
-			err = -ENOMEM;
-			goto out;
-		}
-	}
-	err = 0;
-
-out:
-	if (err)
-		virtscsi_remove_vqs(vdev);
 	return err;
 }
 
@@ -663,12 +645,9 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	u32 sg_elems, num_targets;
 	u32 cmd_per_lun;
 
-	/* Allocate memory and link the structs together.  */
 	num_targets = virtscsi_config_get(vdev, max_target) + 1;
-	shost = scsi_host_alloc(&virtscsi_host_template,
-		sizeof(*vscsi)
-		+ num_targets * sizeof(struct virtio_scsi_target_state));
 
+	shost = scsi_host_alloc(&virtscsi_host_template, sizeof(*vscsi));
 	if (!shost)
 		return -ENOMEM;
 
@@ -678,7 +657,7 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	vscsi->vdev = vdev;
 	vdev->priv = shost;
 
-	err = virtscsi_init(vdev, vscsi, num_targets);
+	err = virtscsi_init(vdev, vscsi);
 	if (err)
 		goto virtscsi_init_failed;
 
@@ -735,7 +714,7 @@ static int virtscsi_restore(struct virtio_device *vdev)
 	struct Scsi_Host *sh = virtio_scsi_host(vdev);
 	struct virtio_scsi *vscsi = shost_priv(sh);
 
-	return virtscsi_init(vdev, vscsi, sh->max_id);
+	return virtscsi_init(vdev, vscsi);
 }
 #endif
 
-- 
cgit v1.2.3


From 7f82b3c9158f3ce8eb9eaf0320a359de9ab6392c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 8 Apr 2013 23:01:38 +0930
Subject: virtio-scsi: pass struct virtio_scsi to virtqueue completion function

This will be needed soon in order to retrieve the per-target
struct.

Cc: linux-scsi@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Reviewed-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/scsi/virtio_scsi.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index ffa03e8cb19c..c23560c6a32e 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -104,7 +104,7 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
  *
  * Called with vq_lock held.
  */
-static void virtscsi_complete_cmd(void *buf)
+static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
 {
 	struct virtio_scsi_cmd *cmd = buf;
 	struct scsi_cmnd *sc = cmd->sc;
@@ -165,7 +165,8 @@ static void virtscsi_complete_cmd(void *buf)
 	sc->scsi_done(sc);
 }
 
-static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
+static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtqueue *vq,
+			     void (*fn)(struct virtio_scsi *vscsi, void *buf))
 {
 	void *buf;
 	unsigned int len;
@@ -173,7 +174,7 @@ static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
 	do {
 		virtqueue_disable_cb(vq);
 		while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
-			fn(buf);
+			fn(vscsi, buf);
 	} while (!virtqueue_enable_cb(vq));
 }
 
@@ -184,11 +185,11 @@ static void virtscsi_req_done(struct virtqueue *vq)
 	unsigned long flags;
 
 	spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
-	virtscsi_vq_done(vq, virtscsi_complete_cmd);
+	virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd);
 	spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
 };
 
-static void virtscsi_complete_free(void *buf)
+static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
 {
 	struct virtio_scsi_cmd *cmd = buf;
 
@@ -205,7 +206,7 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
 	unsigned long flags;
 
 	spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags);
-	virtscsi_vq_done(vq, virtscsi_complete_free);
+	virtscsi_vq_done(vscsi, vq, virtscsi_complete_free);
 	spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
 };
 
@@ -329,7 +330,7 @@ static void virtscsi_handle_event(struct work_struct *work)
 	virtscsi_kick_event(vscsi, event_node);
 }
 
-static void virtscsi_complete_event(void *buf)
+static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
 {
 	struct virtio_scsi_event_node *event_node = buf;
 
@@ -344,7 +345,7 @@ static void virtscsi_event_done(struct virtqueue *vq)
 	unsigned long flags;
 
 	spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
-	virtscsi_vq_done(vq, virtscsi_complete_event);
+	virtscsi_vq_done(vscsi, vq, virtscsi_complete_event);
 	spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
 };
 
-- 
cgit v1.2.3


From 10f34f64d3a50912ae49c67c08c9162effdf546a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 8 Apr 2013 23:02:07 +0930
Subject: virtio-scsi: push vq lock/unlock into virtscsi_vq_done

Avoid duplicated code in all of the callers.

Cc: linux-scsi@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Reviewed-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/scsi/virtio_scsi.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index c23560c6a32e..dc2daec9a10d 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -165,28 +165,30 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
 	sc->scsi_done(sc);
 }
 
-static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtqueue *vq,
+static void virtscsi_vq_done(struct virtio_scsi *vscsi,
+			     struct virtio_scsi_vq *virtscsi_vq,
 			     void (*fn)(struct virtio_scsi *vscsi, void *buf))
 {
 	void *buf;
 	unsigned int len;
+	unsigned long flags;
+	struct virtqueue *vq = virtscsi_vq->vq;
 
+	spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
 	do {
 		virtqueue_disable_cb(vq);
 		while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
 			fn(vscsi, buf);
 	} while (!virtqueue_enable_cb(vq));
+	spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
 }
 
 static void virtscsi_req_done(struct virtqueue *vq)
 {
 	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
 	struct virtio_scsi *vscsi = shost_priv(sh);
-	unsigned long flags;
 
-	spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
-	virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd);
-	spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
+	virtscsi_vq_done(vscsi, &vscsi->req_vq, virtscsi_complete_cmd);
 };
 
 static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
@@ -203,11 +205,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
 {
 	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
 	struct virtio_scsi *vscsi = shost_priv(sh);
-	unsigned long flags;
 
-	spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags);
-	virtscsi_vq_done(vscsi, vq, virtscsi_complete_free);
-	spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
+	virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
 };
 
 static int virtscsi_kick_event(struct virtio_scsi *vscsi,
@@ -342,11 +341,8 @@ static void virtscsi_event_done(struct virtqueue *vq)
 {
 	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
 	struct virtio_scsi *vscsi = shost_priv(sh);
-	unsigned long flags;
 
-	spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
-	virtscsi_vq_done(vscsi, vq, virtscsi_complete_event);
-	spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
+	virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
 };
 
 /**
-- 
cgit v1.2.3


From 9141a4ca0d9551729573042660e9bce83a01e0af Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 8 Apr 2013 23:03:25 +0930
Subject: virtio-scsi: introduce multiqueue support

This patch adds queue steering to virtio-scsi.  When a target is sent
multiple requests, we always drive them to the same queue so that FIFO
processing order is kept.  However, if a target was idle, we can choose
a queue arbitrarily.  In this case the queue is chosen according to the
current VCPU, so the driver expects the number of request queues to be
equal to the number of VCPUs.  This makes it easy and fast to select
the queue, and also lets the driver optimize the IRQ affinity for the
virtqueues (each virtqueue's affinity is set to the CPU that "owns"
the queue).

The speedup comes from improving cache locality and giving CPU affinity
to the virtqueues, which is why this scheme was selected.  Assuming that
the thread that is sending requests to the device is I/O-bound, it is
likely to be sleeping at the time the ISR is executed, and thus executing
the ISR on the same processor that sent the requests is cheap.

However, the kernel will not execute the ISR on the "best" processor
unless you explicitly set the affinity.  This is because in practice
you will have many such I/O-bound processes and thus many otherwise
idle processors.  Then the kernel will execute the ISR on a random
processor, rather than the one that is sending requests to the device.

The alternative to per-CPU virtqueues is per-target virtqueues.  To
achieve the same locality, we could dynamically choose the virtqueue's
affinity based on the CPU of the last task that sent a request.  This
is less appealing because we do not set the affinity directly---we only
provide a hint to the irqbalanced running in userspace.  Dynamically
changing the affinity only works if the userspace applies the hint
fast enough.

Cc: linux-scsi@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Reviewed-by: Asias He <asias@redhat.com>
Tested-by: Venkatesh Srinivas <venkateshs@google.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/scsi/virtio_scsi.c | 282 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 254 insertions(+), 28 deletions(-)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index dc2daec9a10d..8dcdef0783db 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -22,12 +22,14 @@
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_scsi.h>
+#include <linux/cpu.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_cmnd.h>
 
 #define VIRTIO_SCSI_MEMPOOL_SZ 64
 #define VIRTIO_SCSI_EVENT_LEN 8
+#define VIRTIO_SCSI_VQ_BASE 2
 
 /* Command queue element */
 struct virtio_scsi_cmd {
@@ -59,22 +61,58 @@ struct virtio_scsi_vq {
 	struct virtqueue *vq;
 };
 
-/* Per-target queue state */
+/*
+ * Per-target queue state.
+ *
+ * This struct holds the data needed by the queue steering policy.  When a
+ * target is sent multiple requests, we need to drive them to the same queue so
+ * that FIFO processing order is kept.  However, if a target was idle, we can
+ * choose a queue arbitrarily.  In this case the queue is chosen according to
+ * the current VCPU, so the driver expects the number of request queues to be
+ * equal to the number of VCPUs.  This makes it easy and fast to select the
+ * queue, and also lets the driver optimize the IRQ affinity for the virtqueues
+ * (each virtqueue's affinity is set to the CPU that "owns" the queue).
+ *
+ * An interesting effect of this policy is that only writes to req_vq need to
+ * take the tgt_lock.  Reads can be done outside the lock because:
+ *
+ * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
+ *   In that case, no other CPU is reading req_vq: even if they were in
+ *   virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
+ *
+ * - reads of req_vq only occur when the target is not idle (reqs != 0).
+ *   A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
+ *
+ * Similarly, decrements of reqs are never concurrent with writes of req_vq.
+ * Thus they can happen outside the tgt_lock, provided of course we make reqs
+ * an atomic_t.
+ */
 struct virtio_scsi_target_state {
-	/* Never held at the same time as vq_lock.  */
+	/* This spinlock is never held at the same time as vq_lock. */
 	spinlock_t tgt_lock;
+
+	/* Count of outstanding requests. */
+	atomic_t reqs;
+
+	/* Currently active virtqueue for requests sent to this target. */
+	struct virtio_scsi_vq *req_vq;
 };
 
 /* Driver instance state */
 struct virtio_scsi {
 	struct virtio_device *vdev;
 
-	struct virtio_scsi_vq ctrl_vq;
-	struct virtio_scsi_vq event_vq;
-	struct virtio_scsi_vq req_vq;
-
 	/* Get some buffers ready for event vq */
 	struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
+
+	u32 num_queues;
+
+	/* If the affinity hint is set for virtqueues */
+	bool affinity_hint_set;
+
+	struct virtio_scsi_vq ctrl_vq;
+	struct virtio_scsi_vq event_vq;
+	struct virtio_scsi_vq req_vqs[];
 };
 
 static struct kmem_cache *virtscsi_cmd_cache;
@@ -109,6 +147,8 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
 	struct virtio_scsi_cmd *cmd = buf;
 	struct scsi_cmnd *sc = cmd->sc;
 	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
+	struct virtio_scsi_target_state *tgt =
+				scsi_target(sc->device)->hostdata;
 
 	dev_dbg(&sc->device->sdev_gendev,
 		"cmd %p response %u status %#02x sense_len %u\n",
@@ -163,6 +203,8 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
 
 	mempool_free(cmd, virtscsi_cmd_pool);
 	sc->scsi_done(sc);
+
+	atomic_dec(&tgt->reqs);
 }
 
 static void virtscsi_vq_done(struct virtio_scsi *vscsi,
@@ -187,8 +229,42 @@ static void virtscsi_req_done(struct virtqueue *vq)
 {
 	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
 	struct virtio_scsi *vscsi = shost_priv(sh);
+	int index = vq->index - VIRTIO_SCSI_VQ_BASE;
+	struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
 
-	virtscsi_vq_done(vscsi, &vscsi->req_vq, virtscsi_complete_cmd);
+	/*
+	 * Read req_vq before decrementing the reqs field in
+	 * virtscsi_complete_cmd.
+	 *
+	 * With barriers:
+	 *
+	 * 	CPU #0			virtscsi_queuecommand_multi (CPU #1)
+	 * 	------------------------------------------------------------
+	 * 	lock vq_lock
+	 * 	read req_vq
+	 * 	read reqs (reqs = 1)
+	 * 	write reqs (reqs = 0)
+	 * 				increment reqs (reqs = 1)
+	 * 				write req_vq
+	 *
+	 * Possible reordering without barriers:
+	 *
+	 * 	CPU #0			virtscsi_queuecommand_multi (CPU #1)
+	 * 	------------------------------------------------------------
+	 * 	lock vq_lock
+	 * 	read reqs (reqs = 1)
+	 * 	write reqs (reqs = 0)
+	 * 				increment reqs (reqs = 1)
+	 * 				write req_vq
+	 * 	read (wrong) req_vq
+	 *
+	 * We do not need a full smp_rmb, because req_vq is required to get
+	 * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored
+	 * in the virtqueue as the user token.
+	 */
+	smp_read_barrier_depends();
+
+	virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
 };
 
 static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
@@ -251,7 +327,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
 }
 
 static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
-						struct virtio_scsi_event *event)
+					    struct virtio_scsi_event *event)
 {
 	struct scsi_device *sdev;
 	struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
@@ -410,9 +486,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
 	return err;
 }
 
-static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
+static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
+				 struct virtio_scsi_vq *req_vq,
+				 struct scsi_cmnd *sc)
 {
-	struct virtio_scsi *vscsi = shost_priv(sh);
 	struct virtio_scsi_cmd *cmd;
 	int ret;
 
@@ -446,7 +523,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
 	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
 	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
 
-	if (virtscsi_kick_cmd(&vscsi->req_vq, cmd,
+	if (virtscsi_kick_cmd(req_vq, cmd,
 			      sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
 			      GFP_ATOMIC) == 0)
 		ret = 0;
@@ -457,6 +534,55 @@ out:
 	return ret;
 }
 
+static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
+					struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	struct virtio_scsi_target_state *tgt =
+				scsi_target(sc->device)->hostdata;
+
+	atomic_inc(&tgt->reqs);
+	return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
+}
+
+static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
+					       struct virtio_scsi_target_state *tgt)
+{
+	struct virtio_scsi_vq *vq;
+	unsigned long flags;
+	u32 queue_num;
+
+	spin_lock_irqsave(&tgt->tgt_lock, flags);
+
+	/*
+	 * The memory barrier after atomic_inc_return matches
+	 * the smp_read_barrier_depends() in virtscsi_req_done.
+	 */
+	if (atomic_inc_return(&tgt->reqs) > 1)
+		vq = ACCESS_ONCE(tgt->req_vq);
+	else {
+		queue_num = smp_processor_id();
+		while (unlikely(queue_num >= vscsi->num_queues))
+			queue_num -= vscsi->num_queues;
+
+		tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
+	}
+
+	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
+	return vq;
+}
+
+static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
+				       struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	struct virtio_scsi_target_state *tgt =
+				scsi_target(sc->device)->hostdata;
+	struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt);
+
+	return virtscsi_queuecommand(vscsi, req_vq, sc);
+}
+
 static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
 {
 	DECLARE_COMPLETION_ONSTACK(comp);
@@ -533,6 +659,8 @@ static int virtscsi_target_alloc(struct scsi_target *starget)
 		return -ENOMEM;
 
 	spin_lock_init(&tgt->tgt_lock);
+	atomic_set(&tgt->reqs, 0);
+	tgt->req_vq = NULL;
 
 	starget->hostdata = tgt;
 	return 0;
@@ -544,12 +672,28 @@ static void virtscsi_target_destroy(struct scsi_target *starget)
 	kfree(tgt);
 }
 
-static struct scsi_host_template virtscsi_host_template = {
+static struct scsi_host_template virtscsi_host_template_single = {
 	.module = THIS_MODULE,
 	.name = "Virtio SCSI HBA",
 	.proc_name = "virtio_scsi",
-	.queuecommand = virtscsi_queuecommand,
 	.this_id = -1,
+	.queuecommand = virtscsi_queuecommand_single,
+	.eh_abort_handler = virtscsi_abort,
+	.eh_device_reset_handler = virtscsi_device_reset,
+
+	.can_queue = 1024,
+	.dma_boundary = UINT_MAX,
+	.use_clustering = ENABLE_CLUSTERING,
+	.target_alloc = virtscsi_target_alloc,
+	.target_destroy = virtscsi_target_destroy,
+};
+
+static struct scsi_host_template virtscsi_host_template_multi = {
+	.module = THIS_MODULE,
+	.name = "Virtio SCSI HBA",
+	.proc_name = "virtio_scsi",
+	.this_id = -1,
+	.queuecommand = virtscsi_queuecommand_multi,
 	.eh_abort_handler = virtscsi_abort,
 	.eh_device_reset_handler = virtscsi_device_reset,
 
@@ -577,6 +721,47 @@ static struct scsi_host_template virtscsi_host_template = {
 				  &__val, sizeof(__val)); \
 	})
 
+static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
+{
+	int i;
+	int cpu;
+
+	/* Set (affinity == true) or clear the request vq affinity hints.
+	 * Hints are only set in multiqueue mode with one request queue per
+	 * online cpu, making each queue private to a cpu to eliminate
+	 * contention; otherwise any hints set earlier are cleared.
+	 */
+	if ((vscsi->num_queues == 1 ||
+	     vscsi->num_queues != num_online_cpus()) && affinity) {
+		if (vscsi->affinity_hint_set)
+			affinity = false;
+		else
+			return;
+	}
+
+	if (affinity) {
+		i = 0;
+		for_each_online_cpu(cpu) {
+			virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
+			i++;
+		}
+
+		vscsi->affinity_hint_set = true;
+	} else {
+		/* num_queues counts request vqs only: no VQ_BASE offset. */
+		for (i = 0; i < vscsi->num_queues; i++)
+			virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
+		vscsi->affinity_hint_set = false;
+	}
+}
+
+static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
+{
+	get_online_cpus();
+	__virtscsi_set_affinity(vscsi, affinity);
+	put_online_cpus();
+}
+
 static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 			     struct virtqueue *vq)
 {
@@ -593,6 +778,11 @@ static void virtscsi_scan(struct virtio_device *vdev)
 
 static void virtscsi_remove_vqs(struct virtio_device *vdev)
 {
+	struct Scsi_Host *sh = virtio_scsi_host(vdev);
+	struct virtio_scsi *vscsi = shost_priv(sh);
+
+	virtscsi_set_affinity(vscsi, false);
+
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
@@ -603,27 +793,43 @@ static int virtscsi_init(struct virtio_device *vdev,
 			 struct virtio_scsi *vscsi)
 {
 	int err;
-	struct virtqueue *vqs[3];
+	u32 i;
+	u32 num_vqs;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct virtqueue **vqs;
+
+	num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
+	vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
+	callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
+	names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
+
+	if (!callbacks || !vqs || !names) {
+		err = -ENOMEM;
+		goto out;
+	}
 
-	vq_callback_t *callbacks[] = {
-		virtscsi_ctrl_done,
-		virtscsi_event_done,
-		virtscsi_req_done
-	};
-	const char *names[] = {
-		"control",
-		"event",
-		"request"
-	};
+	callbacks[0] = virtscsi_ctrl_done;
+	callbacks[1] = virtscsi_event_done;
+	names[0] = "control";
+	names[1] = "event";
+	for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
+		callbacks[i] = virtscsi_req_done;
+		names[i] = "request";
+	}
 
 	/* Discover virtqueues and write information to configuration.  */
-	err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
 	if (err)
-		return err;
+		goto out;
 
 	virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
 	virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
-	virtscsi_init_vq(&vscsi->req_vq, vqs[2]);
+	for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
+		virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
+				 vqs[i]);
+
+	virtscsi_set_affinity(vscsi, true);
 
 	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
 	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
@@ -631,6 +837,14 @@ static int virtscsi_init(struct virtio_device *vdev,
 	if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
 		virtscsi_kick_event_all(vscsi);
 
+	err = 0;
+
+out:
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
+	if (err)
+		virtscsi_remove_vqs(vdev);
 	return err;
 }
 
@@ -641,10 +855,21 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	int err;
 	u32 sg_elems, num_targets;
 	u32 cmd_per_lun;
+	u32 num_queues;
+	struct scsi_host_template *hostt;
+
+	/* We need to know how many queues before we allocate. */
+	num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
 
 	num_targets = virtscsi_config_get(vdev, max_target) + 1;
 
-	shost = scsi_host_alloc(&virtscsi_host_template, sizeof(*vscsi));
+	if (num_queues == 1)
+		hostt = &virtscsi_host_template_single;
+	else
+		hostt = &virtscsi_host_template_multi;
+
+	shost = scsi_host_alloc(hostt,
+		sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
 	if (!shost)
 		return -ENOMEM;
 
@@ -652,6 +877,7 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	shost->sg_tablesize = sg_elems;
 	vscsi = shost_priv(shost);
 	vscsi->vdev = vdev;
+	vscsi->num_queues = num_queues;
 	vdev->priv = shost;
 
 	err = virtscsi_init(vdev, vscsi);
-- 
cgit v1.2.3


From 285e71ea6f3583a85e27cb2b9a7d8c35d4c0d558 Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Mon, 8 Apr 2013 23:05:49 +0930
Subject: virtio-scsi: reset virtqueue affinity when doing cpu hotplug

Add hot cpu notifier to reset the request virtqueue affinity
when doing cpu hotplug.

Cc: linux-scsi@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Reviewed-by: Asias He <asias@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/scsi/virtio_scsi.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'drivers')

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 8dcdef0783db..2168258fb2c3 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -110,6 +110,9 @@ struct virtio_scsi {
 	/* If the affinity hint is set for virtqueues */
 	bool affinity_hint_set;
 
+	/* CPU hotplug notifier */
+	struct notifier_block nb;
+
 	struct virtio_scsi_vq ctrl_vq;
 	struct virtio_scsi_vq event_vq;
 	struct virtio_scsi_vq req_vqs[];
@@ -762,6 +765,23 @@ static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
 	put_online_cpus();
 }
 
+static int virtscsi_cpu_callback(struct notifier_block *nfb,
+				 unsigned long action, void *hcpu)
+{
+	/* The notifier block is embedded in the driver state; recover it. */
+	struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
+	bool cpu_set_changed = action == CPU_ONLINE ||
+			       action == CPU_ONLINE_FROZEN ||
+			       action == CPU_DEAD ||
+			       action == CPU_DEAD_FROZEN;
+
+	/* Re-evaluate the per-queue affinity hints on online/dead events. */
+	if (cpu_set_changed)
+		__virtscsi_set_affinity(vscsi, true);
+
+	return NOTIFY_OK;
+}
+
 static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
 			     struct virtqueue *vq)
 {
@@ -884,6 +904,13 @@ static int virtscsi_probe(struct virtio_device *vdev)
 	if (err)
 		goto virtscsi_init_failed;
 
+	vscsi->nb.notifier_call = &virtscsi_cpu_callback;
+	err = register_hotcpu_notifier(&vscsi->nb);
+	if (err) {
+		pr_err("registering cpu notifier failed\n");
+		goto scsi_add_host_failed;
+	}
+
 	cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
 	shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
 	shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@@ -921,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
 
 	scsi_remove_host(shost);
 
+	unregister_hotcpu_notifier(&vscsi->nb);
+
 	virtscsi_remove_vqs(vdev);
 	scsi_host_put(shost);
 }
-- 
cgit v1.2.3


From 74ff582cd65ad01c45f1971feac28f23b7eb2687 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Mon, 15 Apr 2013 12:00:15 +0930
Subject: virtio: console: replace EMFILE with EBUSY for already-open port

Returning EMFILE (process has too many open files) is incorrect to
indicate a port is already open by another process.  Use EBUSY for that.

This does change what we report to userspace, but I believe userspace
can look at it this way: it gets EBUSY, a new error code, instead of
EMFILE.  It's still an error, and that's not changing.

Reported-by: Mateusz Guzik <mguzik@redhat.com>
Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/char/virtio_console.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index f4f31fe88902..5ee776595ced 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1036,7 +1036,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
 	spin_lock_irq(&port->inbuf_lock);
 	if (port->guest_connected) {
 		spin_unlock_irq(&port->inbuf_lock);
-		ret = -EMFILE;
+		ret = -EBUSY;
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 406a590ba105bfb7b67952f0a5f948e0d374e03e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:37 +0930
Subject: lguest: prepare to make SWITCHER_ADDR a variable.

We currently use the whole top PGD entry for the switcher, but that's
hitting the fixmap in some configurations (mainly, large NR_CPUS).
Introduce a variable, currently set to the constant.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c     | 18 ++++++++++--------
 drivers/lguest/x86/core.c |  4 ++--
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..099252301132 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,7 +20,7 @@
 #include <asm/asm-offsets.h>
 #include "lg.h"
 
-
+unsigned long switcher_addr;
 static struct vm_struct *switcher_vma;
 static struct page **switcher_page;
 
@@ -75,25 +75,27 @@ static __init int map_switcher(void)
 		}
 	}
 
+	switcher_addr = SWITCHER_ADDR;
+
 	/*
 	 * First we check that the Switcher won't overlap the fixmap area at
 	 * the top of memory.  It's currently nowhere near, but it could have
 	 * very strange effects if it ever happened.
 	 */
-	if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
+	if (switcher_addr + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
 		err = -ENOMEM;
 		printk("lguest: mapping switcher would thwack fixmap\n");
 		goto free_pages;
 	}
 
 	/*
-	 * Now we reserve the "virtual memory area" we want: 0xFFC00000
-	 * (SWITCHER_ADDR).  We might not get it in theory, but in practice
-	 * it's worked so far.  The end address needs +1 because __get_vm_area
-	 * allocates an extra guard page, so we need space for that.
+	 * Now we reserve the "virtual memory area" we want.  We might
+	 * not get it in theory, but in practice it's worked so far.
+	 * The end address needs +1 because __get_vm_area allocates an
+	 * extra guard page, so we need space for that.
 	 */
 	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-				     VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+				     VM_ALLOC, switcher_addr, switcher_addr
 				     + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
 	if (!switcher_vma) {
 		err = -ENOMEM;
@@ -103,7 +105,7 @@ static __init int map_switcher(void)
 
 	/*
 	 * This code actually sets up the pages we've allocated to appear at
-	 * SWITCHER_ADDR.  map_vm_area() takes the vma we allocated above, the
+	 * switcher_addr.  map_vm_area() takes the vma we allocated above, the
 	 * kind of pages we're mapping (kernel pages), and a pointer to our
 	 * array of struct pages.  It increments that pointer, but we don't
 	 * care.
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1844d5..20fae765d600 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,14 @@ static struct {
 /* Offset from where switcher.S was compiled to where we've copied it */
 static unsigned long switcher_offset(void)
 {
-	return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+	return switcher_addr - (unsigned long)start_switcher_text;
 }
 
 /* This cpu's struct lguest_pages. */
 static struct lguest_pages *lguest_pages(unsigned int cpu)
 {
 	return &(((struct lguest_pages *)
-		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+		  (switcher_addr + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
 }
 
 static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
-- 
cgit v1.2.3


From 68a644d734e61f38b686cb755bd2a3f43d9372f4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:37 +0930
Subject: lguest: check vaddr not pgd for Switcher protection.

We currently assume that the Switcher occupies the top pgd; we want to
remove this assumption, so check that vaddr is OK, rather than checking
the pgd index.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/page_tables.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 3b62be160a6e..a2454a24a10c 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -95,13 +95,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
 {
 	unsigned int index = pgd_index(vaddr);
 
-#ifndef CONFIG_X86_PAE
-	/* We kill any Guest trying to touch the Switcher addresses. */
-	if (index >= SWITCHER_PGD_INDEX) {
-		kill_guest(cpu, "attempt to access switcher pages");
-		index = 0;
-	}
-#endif
 	/* Return a pointer index'th pgd entry for the i'th page table. */
 	return &cpu->lg->pgdirs[i].pgdir[index];
 }
@@ -117,13 +110,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 	unsigned int index = pmd_index(vaddr);
 	pmd_t *page;
 
-	/* We kill any Guest trying to touch the Switcher addresses. */
-	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
-					index >= SWITCHER_PMD_INDEX) {
-		kill_guest(cpu, "attempt to access switcher pages");
-		index = 0;
-	}
-
 	/* You should never call this if the PGD entry wasn't valid */
 	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
 	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -323,6 +309,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	pmd_t gpmd;
 #endif
 
+	/* We never demand page the Switcher, so trying is a mistake. */
+	if (vaddr >= switcher_addr)
+		return false;
+
 	/* First step: get the top-level Guest page table entry. */
 	if (unlikely(cpu->linear_pages)) {
 		/* Faking up a linear mapping. */
@@ -495,10 +485,14 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 {
 	pgd_t *spgd;
 	unsigned long flags;
-
 #ifdef CONFIG_X86_PAE
 	pmd_t *spmd;
 #endif
+
+	/* You can't put your stack in the Switcher! */
+	if (vaddr >= switcher_addr)
+		return false;
+
 	/* Look at the current top level entry: is it present? */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
 	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
@@ -897,6 +891,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
 void guest_set_pte(struct lg_cpu *cpu,
 		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
 {
+	/* We don't let you remap the Switcher; we need it to get back! */
+	if (vaddr >= switcher_addr) {
+		kill_guest(cpu, "attempt to set pte into Switcher pages");
+		return;
+	}
+
 	/*
 	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't
 	 * happen often.
@@ -995,12 +995,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
 	 * Switcher mappings, so check that now.
 	 */
-#ifdef CONFIG_X86_PAE
-	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
-		pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
-#else
-	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
-#endif
+	if (cpu->lg->kernel_address >= switcher_addr)
 		kill_guest(cpu, "bad kernel address %#lx",
 				 cpu->lg->kernel_address);
 }
-- 
cgit v1.2.3


From c215a8b9eb17739c01d59faa7db9d1ef162a82a8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:37 +0930
Subject: lguest: remove RESERVE_MEM constant.

We can use switcher_addr directly.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/page_tables.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a2454a24a10c..27cbb186a911 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -63,10 +63,8 @@
  */
 #ifdef CONFIG_X86_PAE
 #define SWITCHER_PMD_INDEX 	(PTRS_PER_PMD - 1)
-#define RESERVE_MEM 		2U
 #define CHECK_GPGD_MASK		_PAGE_PRESENT
 #else
-#define RESERVE_MEM 		4U
 #define CHECK_GPGD_MASK		_PAGE_TABLE
 #endif
 
@@ -977,15 +975,21 @@ int init_guest_pagetable(struct lguest *lg)
 /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
 void page_table_guest_data_init(struct lg_cpu *cpu)
 {
+	/*
+	 * We tell the Guest that it can't use the virtual addresses
+	 * used by the Switcher.  This trick is equivalent to 4GB -
+	 * switcher_addr.
+	 */
+	u32 top = ~switcher_addr + 1;
+
 	/* We get the kernel address: above this is all kernel memory. */
 	if (get_user(cpu->lg->kernel_address,
-		&cpu->lg->lguest_data->kernel_address)
+		     &cpu->lg->lguest_data->kernel_address)
 		/*
-		 * We tell the Guest that it can't use the top 2 or 4 MB
-		 * of virtual addresses used by the Switcher.
+		 * We tell the Guest that it can't use the top virtual
+		 * addresses (used by the Switcher).
 		 */
-		|| put_user(RESERVE_MEM * 1024 * 1024,
-			    &cpu->lg->lguest_data->reserve_mem)) {
+	    || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
 		return;
 	}
-- 
cgit v1.2.3


From 856c608827928d29f80605e85fc3f8f0ab3af4fb Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:38 +0930
Subject: lguest: rename switcher_page to switcher_pages.

There is a single page with the Switcher in it, but it's followed by 2
pages per Host CPU.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c        | 24 ++++++++++++------------
 drivers/lguest/lg.h          |  2 +-
 drivers/lguest/page_tables.c | 12 ++++++------
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 099252301132..211d8267992b 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -22,7 +22,7 @@
 
 unsigned long switcher_addr;
 static struct vm_struct *switcher_vma;
-static struct page **switcher_page;
+static struct page **switcher_pages;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -56,9 +56,9 @@ static __init int map_switcher(void)
 	 * We allocate an array of struct page pointers.  map_vm_area() wants
 	 * this, rather than just an array of pages.
 	 */
-	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
-				GFP_KERNEL);
-	if (!switcher_page) {
+	switcher_pages = kmalloc(sizeof(switcher_pages[0])*TOTAL_SWITCHER_PAGES,
+				 GFP_KERNEL);
+	if (!switcher_pages) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -68,8 +68,8 @@ static __init int map_switcher(void)
 	 * so we make sure they're zeroed.
 	 */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!switcher_page[i]) {
+		switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!switcher_pages[i]) {
 			err = -ENOMEM;
 			goto free_some_pages;
 		}
@@ -110,7 +110,7 @@ static __init int map_switcher(void)
 	 * array of struct pages.  It increments that pointer, but we don't
 	 * care.
 	 */
-	pagep = switcher_page;
+	pagep = switcher_pages;
 	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
@@ -135,8 +135,8 @@ free_pages:
 	i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
 	for (--i; i >= 0; i--)
-		__free_pages(switcher_page[i], 0);
-	kfree(switcher_page);
+		__free_pages(switcher_pages[i], 0);
+	kfree(switcher_pages);
 out:
 	return err;
 }
@@ -151,8 +151,8 @@ static void unmap_switcher(void)
 	vunmap(switcher_vma->addr);
 	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-		__free_pages(switcher_page[i], 0);
-	kfree(switcher_page);
+		__free_pages(switcher_pages[i], 0);
+	kfree(switcher_pages);
 }
 
 /*H:032
@@ -326,7 +326,7 @@ static int __init init(void)
 		goto out;
 
 	/* Now we set up the pagetable implementation for the Guests. */
-	err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
+	err = init_pagetables(switcher_pages, SHARED_SWITCHER_PAGES);
 	if (err)
 		goto unmap;
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..8bf68c54ff7f 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -15,7 +15,7 @@
 #include <asm/lguest.h>
 
 void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
+int init_pagetables(struct page **switcher_pages, unsigned int pages);
 
 struct pgdir {
 	unsigned long gpgdir;
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 27cbb186a911..21685580eb9f 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -1084,7 +1084,7 @@ static void free_switcher_pte_pages(void)
  * Currently the Switcher is less than a page long, so "pages" is always 1.
  */
 static __init void populate_switcher_pte_page(unsigned int cpu,
-					      struct page *switcher_page[],
+					      struct page *switcher_pages[],
 					      unsigned int pages)
 {
 	unsigned int i;
@@ -1092,7 +1092,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
 
 	/* The first entries are easy: they map the Switcher code. */
 	for (i = 0; i < pages; i++) {
-		set_pte(&pte[i], mk_pte(switcher_page[i],
+		set_pte(&pte[i], mk_pte(switcher_pages[i],
 				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
 	}
 
@@ -1100,14 +1100,14 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
 	i = pages + cpu*2;
 
 	/* First page (Guest registers) is writable from the Guest */
-	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
+	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]),
 			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
 
 	/*
 	 * The second page contains the "struct lguest_ro_state", and is
 	 * read-only.
 	 */
-	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
+	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]),
 			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
 }
 
@@ -1128,7 +1128,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
  * At boot or module load time, init_pagetables() allocates and populates
  * the Switcher PTE page for each CPU.
  */
-__init int init_pagetables(struct page **switcher_page, unsigned int pages)
+__init int init_pagetables(struct page **switcher_pages, unsigned int pages)
 {
 	unsigned int i;
 
@@ -1138,7 +1138,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages)
 			free_switcher_pte_pages();
 			return -ENOMEM;
 		}
-		populate_switcher_pte_page(i, switcher_page, pages);
+		populate_switcher_pte_page(i, switcher_pages, pages);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 93a2cdff98243df06bafd3c4f3b31b38f0d0fe3e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:38 +0930
Subject: lguest: assume Switcher text is a single page.

ie. SHARED_SWITCHER_PAGES == 1.  It is well under a page, and it's a
minor simplification: it's nice to have *one* simplification in a
patch series!

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c        |  9 ++++++++-
 drivers/lguest/lg.h          |  2 +-
 drivers/lguest/page_tables.c | 21 ++++++++-------------
 drivers/lguest/x86/core.c    |  5 ++---
 4 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 211d8267992b..4209065b9b1e 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -52,6 +52,13 @@ static __init int map_switcher(void)
 	 * easy.
 	 */
 
+	/* We assume Switcher text fits into a single page. */
+	if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
+		printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
+		       end_switcher_text - start_switcher_text);
+		return -EINVAL;
+	}
+
 	/*
 	 * We allocate an array of struct page pointers.  map_vm_area() wants
 	 * this, rather than just an array of pages.
@@ -326,7 +333,7 @@ static int __init init(void)
 		goto out;
 
 	/* Now we set up the pagetable implementation for the Guests. */
-	err = init_pagetables(switcher_pages, SHARED_SWITCHER_PAGES);
+	err = init_pagetables(switcher_pages);
 	if (err)
 		goto unmap;
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 8bf68c54ff7f..4c3e532d50d6 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -15,7 +15,7 @@
 #include <asm/lguest.h>
 
 void free_pagetables(void);
-int init_pagetables(struct page **switcher_pages, unsigned int pages);
+int init_pagetables(struct page **switcher_pages);
 
 struct pgdir {
 	unsigned long gpgdir;
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 21685580eb9f..758466299b0d 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -1079,25 +1079,20 @@ static void free_switcher_pte_pages(void)
 
 /*H:520
  * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher code itself.
- *
- * Currently the Switcher is less than a page long, so "pages" is always 1.
+ * the CPU number and the "struct page"s for the Switcher and per-cpu pages.
  */
 static __init void populate_switcher_pte_page(unsigned int cpu,
-					      struct page *switcher_pages[],
-					      unsigned int pages)
+					      struct page *switcher_pages[])
 {
-	unsigned int i;
 	pte_t *pte = switcher_pte_page(cpu);
+	int i;
 
-	/* The first entries are easy: they map the Switcher code. */
-	for (i = 0; i < pages; i++) {
-		set_pte(&pte[i], mk_pte(switcher_pages[i],
+	/* The first entries maps the Switcher code. */
+	set_pte(&pte[0], mk_pte(switcher_pages[0],
 				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-	}
 
 	/* The only other thing we map is this CPU's pair of pages. */
-	i = pages + cpu*2;
+	i = 1 + cpu*2;
 
 	/* First page (Guest registers) is writable from the Guest */
 	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]),
@@ -1128,7 +1123,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
  * At boot or module load time, init_pagetables() allocates and populates
  * the Switcher PTE page for each CPU.
  */
-__init int init_pagetables(struct page **switcher_pages, unsigned int pages)
+__init int init_pagetables(struct page **switcher_pages)
 {
 	unsigned int i;
 
@@ -1138,7 +1133,7 @@ __init int init_pagetables(struct page **switcher_pages, unsigned int pages)
 			free_switcher_pte_pages();
 			return -ENOMEM;
 		}
-		populate_switcher_pte_page(i, switcher_pages, pages);
+		populate_switcher_pte_page(i, switcher_pages);
 	}
 	return 0;
 }
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 20fae765d600..f0a3347b6441 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -62,11 +62,10 @@ static unsigned long switcher_offset(void)
 	return switcher_addr - (unsigned long)start_switcher_text;
 }
 
-/* This cpu's struct lguest_pages. */
+/* This cpu's struct lguest_pages (after the Switcher text page) */
 static struct lguest_pages *lguest_pages(unsigned int cpu)
 {
-	return &(((struct lguest_pages *)
-		  (switcher_addr + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+	return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
 }
 
 static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
-- 
cgit v1.2.3


From e1d12606f756bdb8328a66a2873dca6c46bcb4e5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:39 +0930
Subject: lguest: make check_gpte et al. return bool.

This is a bit neater: we can immediately return if a PTE/PGD/PMD entry
is invalid (which also kills the guest).  It means we don't risk using
invalid entries as we reshuffle the code.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/page_tables.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 758466299b0d..f074f34acb86 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -259,26 +259,35 @@ static void release_pte(pte_t pte)
 }
 /*:*/
 
-static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
+static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
 {
 	if ((pte_flags(gpte) & _PAGE_PSE) ||
-	    pte_pfn(gpte) >= cpu->lg->pfn_limit)
+	    pte_pfn(gpte) >= cpu->lg->pfn_limit) {
 		kill_guest(cpu, "bad page table entry");
+		return false;
+	}
+	return true;
 }
 
-static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
+static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
 {
 	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
-	   (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
 		kill_guest(cpu, "bad page directory entry");
+		return false;
+	}
+	return true;
 }
 
 #ifdef CONFIG_X86_PAE
-static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
 {
 	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
-	   (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
 		kill_guest(cpu, "bad page middle directory entry");
+		return false;
+	}
+	return true;
 }
 #endif
 
@@ -336,7 +345,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 			return false;
 		}
 		/* We check that the Guest pgd is OK. */
-		check_gpgd(cpu, gpgd);
+		if (!check_gpgd(cpu, gpgd))
+			return false;
 		/*
 		 * And we copy the flags to the shadow PGD entry.  The page
 		 * number in the shadow PGD is the page we just allocated.
@@ -372,7 +382,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		}
 
 		/* We check that the Guest pmd is OK. */
-		check_gpmd(cpu, gpmd);
+		if (!check_gpmd(cpu, gpmd))
+			return false;
 
 		/*
 		 * And we copy the flags to the shadow PMD entry.  The page
@@ -421,7 +432,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	 * Check that the Guest PTE flags are OK, and the page number is below
 	 * the pfn_limit (ie. not mapping the Launcher binary).
 	 */
-	check_gpte(cpu, gpte);
+	if (!check_gpte(cpu, gpte))
+		return false;
 
 	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
 	gpte = pte_mkyoung(gpte);
@@ -857,7 +869,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
 			 * micro-benchmark.
 			 */
 			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
-				check_gpte(cpu, gpte);
+				if (!check_gpte(cpu, gpte))
+					return;
 				set_pte(spte,
 					gpte_to_spte(cpu, gpte,
 						pte_flags(gpte) & _PAGE_DIRTY));
-- 
cgit v1.2.3


From 17427e08faae3e63271a9c2d0edb6a22e5fbb54b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:39 +0930
Subject: lguest: extract shadow PTE walking / allocating.

We want a separate find_spte() function so we can call it for populating the
switcher PTE entries.

We can also use it in page_writable().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/page_tables.c | 170 +++++++++++++++++++++++++------------------
 1 file changed, 101 insertions(+), 69 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index f074f34acb86..009c717fda99 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -291,6 +291,88 @@ static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
 }
 #endif
 
+/*H:331
+ * This is the core routine to walk the shadow page tables and find the page
+ * table entry for a specific address.
+ *
+ * If allocate is set, then we allocate any missing levels, setting the flags
+ * on the new page directory and mid-level directories using the arguments
+ * (which are copied from the Guest's page table entries).
+ */
+static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
+			int pgd_flags, int pmd_flags)
+{
+	pgd_t *spgd;
+	/* Mid level for PAE. */
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+#endif
+
+	/* Get top level entry. */
+	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
+	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
+		/* No shadow entry: allocate a new shadow PTE page. */
+		unsigned long ptepage;
+
+		/* If they didn't want us to allocate anything, stop. */
+		if (!allocate)
+			return NULL;
+
+		ptepage = get_zeroed_page(GFP_KERNEL);
+		/*
+		 * This is not really the Guest's fault, but killing it is
+		 * simple for this corner case.
+		 */
+		if (!ptepage) {
+			kill_guest(cpu, "out of memory allocating pte page");
+			return NULL;
+		}
+		/*
+		 * And we copy the flags to the shadow PGD entry.  The page
+		 * number in the shadow PGD is the page we just allocated.
+		 */
+		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
+	}
+
+	/*
+	 * Intel's Physical Address Extension actually uses three levels of
+	 * page tables, so we need to look in the mid-level.
+	 */
+#ifdef CONFIG_X86_PAE
+	/* Now look at the mid-level shadow entry. */
+	spmd = spmd_addr(cpu, *spgd, vaddr);
+
+	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
+		/* No shadow entry: allocate a new shadow PTE page. */
+		unsigned long ptepage;
+
+		/* If they didn't want us to allocate anything, stop. */
+		if (!allocate)
+			return NULL;
+
+		ptepage = get_zeroed_page(GFP_KERNEL);
+
+		/*
+		 * This is not really the Guest's fault, but killing it is
+		 * simple for this corner case.
+		 */
+		if (!ptepage) {
+			kill_guest(cpu, "out of memory allocating pmd page");
+			return NULL;
+		}
+
+		/*
+		 * And we copy the flags to the shadow PMD entry.  The page
+		 * number in the shadow PMD is the page we just allocated.
+		 */
+		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
+	}
+#endif
+
+	/* Get the pointer to the shadow PTE entry we're going to set. */
+	return spte_addr(cpu, *spgd, vaddr);
+}
+
 /*H:330
  * (i) Looking up a page table entry when the Guest faults.
  *
@@ -304,17 +386,11 @@ static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
  */
 bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 {
-	pgd_t gpgd;
-	pgd_t *spgd;
 	unsigned long gpte_ptr;
 	pte_t gpte;
 	pte_t *spte;
-
-	/* Mid level for PAE. */
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
 	pmd_t gpmd;
-#endif
+	pgd_t gpgd;
 
 	/* We never demand page the Switcher, so trying is a mistake. */
 	if (vaddr >= switcher_addr)
@@ -329,67 +405,31 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		/* Toplevel not present?  We can't map it in. */
 		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
 			return false;
-	}
 
-	/* Now look at the matching shadow entry. */
-	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
-	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
+		/* 
+		 * This kills the Guest if it has weird flags or tries to
+		 * refer to a "physical" address outside the bounds.
 		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pte page");
-			return false;
-		}
-		/* We check that the Guest pgd is OK. */
 		if (!check_gpgd(cpu, gpgd))
 			return false;
-		/*
-		 * And we copy the flags to the shadow PGD entry.  The page
-		 * number in the shadow PGD is the page we just allocated.
-		 */
-		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
 	}
 
+	/* This "mid-level" entry is only used for non-linear, PAE mode. */
+	gpmd = __pmd(_PAGE_TABLE);
+
 #ifdef CONFIG_X86_PAE
-	if (unlikely(cpu->linear_pages)) {
-		/* Faking up a linear mapping. */
-		gpmd = __pmd(_PAGE_TABLE);
-	} else {
+	if (likely(!cpu->linear_pages)) {
 		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
 		/* Middle level not present?  We can't map it in. */
 		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
 			return false;
-	}
-
-	/* Now look at the matching shadow entry. */
-	spmd = spmd_addr(cpu, *spgd, vaddr);
-
-	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
 
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
+		/* 
+		 * This kills the Guest if it has weird flags or tries to
+		 * refer to a "physical" address outside the bounds.
 		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pte page");
-			return false;
-		}
-
-		/* We check that the Guest pmd is OK. */
 		if (!check_gpmd(cpu, gpmd))
 			return false;
-
-		/*
-		 * And we copy the flags to the shadow PMD entry.  The page
-		 * number in the shadow PMD is the page we just allocated.
-		 */
-		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
 	}
 
 	/*
@@ -441,7 +481,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		gpte = pte_mkdirty(gpte);
 
 	/* Get the pointer to the shadow PTE entry we're going to set. */
-	spte = spte_addr(cpu, *spgd, vaddr);
+	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
+	if (!spte)
+		return false;
 
 	/*
 	 * If there was a valid shadow PTE entry here before, we release it.
@@ -493,33 +535,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
  */
 static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 {
-	pgd_t *spgd;
+	pte_t *spte;
 	unsigned long flags;
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
 
 	/* You can't put your stack in the Switcher! */
 	if (vaddr >= switcher_addr)
 		return false;
 
-	/* Look at the current top level entry: is it present? */
-	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
-	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
+	/* If there's no shadow PTE, it's not writable. */
+	spte = find_spte(cpu, vaddr, false, 0, 0);
+	if (!spte)
 		return false;
 
-#ifdef CONFIG_X86_PAE
-	spmd = spmd_addr(cpu, *spgd, vaddr);
-	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
-		return false;
-#endif
-
 	/*
 	 * Check the flags on the pte entry itself: it must be present and
 	 * writable.
 	 */
-	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
-
+	flags = pte_flags(*spte);
 	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
 }
 
-- 
cgit v1.2.3


From f1f394b1c33d93416c90f97e201d4d386c04af55 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:40 +0930
Subject: lguest: expose switcher_pages array (as lg_switcher_pages).

We will need this in page_tables.c soon.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c | 25 +++++++++++++------------
 drivers/lguest/lg.h   |  1 +
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4209065b9b1e..b6c71c32308c 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -21,8 +21,8 @@
 #include "lg.h"
 
 unsigned long switcher_addr;
+struct page **lg_switcher_pages;
 static struct vm_struct *switcher_vma;
-static struct page **switcher_pages;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -63,9 +63,10 @@ static __init int map_switcher(void)
 	 * We allocate an array of struct page pointers.  map_vm_area() wants
 	 * this, rather than just an array of pages.
 	 */
-	switcher_pages = kmalloc(sizeof(switcher_pages[0])*TOTAL_SWITCHER_PAGES,
-				 GFP_KERNEL);
-	if (!switcher_pages) {
+	lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
+				    * TOTAL_SWITCHER_PAGES,
+				    GFP_KERNEL);
+	if (!lg_switcher_pages) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -75,8 +76,8 @@ static __init int map_switcher(void)
 	 * so we make sure they're zeroed.
 	 */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!switcher_pages[i]) {
+		lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!lg_switcher_pages[i]) {
 			err = -ENOMEM;
 			goto free_some_pages;
 		}
@@ -117,7 +118,7 @@ static __init int map_switcher(void)
 	 * array of struct pages.  It increments that pointer, but we don't
 	 * care.
 	 */
-	pagep = switcher_pages;
+	pagep = lg_switcher_pages;
 	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
@@ -142,8 +143,8 @@ free_pages:
 	i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
 	for (--i; i >= 0; i--)
-		__free_pages(switcher_pages[i], 0);
-	kfree(switcher_pages);
+		__free_pages(lg_switcher_pages[i], 0);
+	kfree(lg_switcher_pages);
 out:
 	return err;
 }
@@ -158,8 +159,8 @@ static void unmap_switcher(void)
 	vunmap(switcher_vma->addr);
 	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-		__free_pages(switcher_pages[i], 0);
-	kfree(switcher_pages);
+		__free_pages(lg_switcher_pages[i], 0);
+	kfree(lg_switcher_pages);
 }
 
 /*H:032
@@ -333,7 +334,7 @@ static int __init init(void)
 		goto out;
 
 	/* Now we set up the pagetable implementation for the Guests. */
-	err = init_pagetables(switcher_pages);
+	err = init_pagetables(lg_switcher_pages);
 	if (err)
 		goto unmap;
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 4c3e532d50d6..9a345efa83e4 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -124,6 +124,7 @@ bool lguest_address_ok(const struct lguest *lg,
 		       unsigned long addr, unsigned long len);
 void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
 void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;
 
 /*H:035
  * Using memory-copy operations like that is usually inconvient, so we
-- 
cgit v1.2.3


From 3412b6ae2924e068f9932f841bdea0f2d8424502 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:40 +0930
Subject: lguest: don't share Switcher PTE pages between guests.

We currently use the whole top PGD entry for the switcher, so we
simply share a fixed page of PTEs between all guests (actually, it's
one per Host CPU, to ensure isolation between guests).

Change to a scheme where every guest has its own mappings.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c        |  10 +-
 drivers/lguest/lg.h          |   3 -
 drivers/lguest/page_tables.c | 260 ++++++++++++++++++-------------------------
 3 files changed, 107 insertions(+), 166 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index b6c71c32308c..7e1d7ee36478 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -333,15 +333,10 @@ static int __init init(void)
 	if (err)
 		goto out;
 
-	/* Now we set up the pagetable implementation for the Guests. */
-	err = init_pagetables(lg_switcher_pages);
-	if (err)
-		goto unmap;
-
 	/* We might need to reserve an interrupt vector. */
 	err = init_interrupts();
 	if (err)
-		goto free_pgtables;
+		goto unmap;
 
 	/* /dev/lguest needs to be registered. */
 	err = lguest_device_init();
@@ -356,8 +351,6 @@ static int __init init(void)
 
 free_interrupts:
 	free_interrupts();
-free_pgtables:
-	free_pagetables();
 unmap:
 	unmap_switcher();
 out:
@@ -369,7 +362,6 @@ static void __exit fini(void)
 {
 	lguest_device_remove();
 	free_interrupts();
-	free_pagetables();
 	unmap_switcher();
 
 	lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 9a345efa83e4..faac9fc6db22 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,9 +14,6 @@
 
 #include <asm/lguest.h>
 
-void free_pagetables(void);
-int init_pagetables(struct page **switcher_pages);
-
 struct pgdir {
 	unsigned long gpgdir;
 	pgd_t *pgdir;
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 009c717fda99..1f48f2712f3a 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -62,20 +62,11 @@
  * will need the last pmd entry of the last pmd page.
  */
 #ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX 	(PTRS_PER_PMD - 1)
 #define CHECK_GPGD_MASK		_PAGE_PRESENT
 #else
 #define CHECK_GPGD_MASK		_PAGE_TABLE
 #endif
 
-/*
- * We actually need a separate PTE page for each CPU.  Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
-
 /*H:320
  * The page table code is curly enough to need helper functions to keep it
  * clear and clean.  The kernel itself provides many of them; one advantage
@@ -714,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 			      int *blank_pgdir)
 {
 	unsigned int next;
-#ifdef CONFIG_X86_PAE
-	pmd_t *pmd_table;
-#endif
 
 	/*
 	 * We pick one entry at random to throw out.  Choosing the Least
@@ -731,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 		if (!cpu->lg->pgdirs[next].pgdir)
 			next = cpu->cpu_pgd;
 		else {
-#ifdef CONFIG_X86_PAE
 			/*
-			 * In PAE mode, allocate a pmd page and populate the
-			 * last pgd entry.
+			 * This is a blank page, so there are no kernel
+			 * mappings: caller must map the stack!
 			 */
-			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
-			if (!pmd_table) {
-				free_page((long)cpu->lg->pgdirs[next].pgdir);
-				set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
-				next = cpu->cpu_pgd;
-			} else {
-				set_pgd(cpu->lg->pgdirs[next].pgdir +
-					SWITCHER_PGD_INDEX,
-					__pgd(__pa(pmd_table) | _PAGE_PRESENT));
-				/*
-				 * This is a blank page, so there are no kernel
-				 * mappings: caller must map the stack!
-				 */
-				*blank_pgdir = 1;
-			}
-#else
 			*blank_pgdir = 1;
-#endif
 		}
 	}
 	/* Record which Guest toplevel this shadows. */
@@ -764,6 +734,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 	return next;
 }
 
+/*H:501
+ * We do need the Switcher code mapped at all times, so we allocate that
+ * part of the Guest page table here, and populate it when we're about to run
+ * the guest.
+ */
+static bool allocate_switcher_mapping(struct lg_cpu *cpu)
+{
+	int i;
+
+	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+		if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
+			       CHECK_GPGD_MASK, _PAGE_TABLE))
+			return false;
+	}
+	return true;
+}
+
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings.  This is used
@@ -774,28 +761,14 @@ static void release_all_pagetables(struct lguest *lg)
 	unsigned int i, j;
 
 	/* Every shadow pagetable this Guest has */
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].pgdir) {
-#ifdef CONFIG_X86_PAE
-			pgd_t *spgd;
-			pmd_t *pmdpage;
-			unsigned int k;
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+		if (!lg->pgdirs[i].pgdir)
+			continue;
 
-			/* Get the last pmd page. */
-			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
-			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
-			/*
-			 * And release the pmd entries of that pmd page,
-			 * except for the switcher pmd.
-			 */
-			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
-				release_pmd(&pmdpage[k]);
-#endif
-			/* Every PGD entry except the Switcher at the top */
-			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
-				release_pgd(lg->pgdirs[i].pgdir + j);
-		}
+		/* Every PGD entry. */
+		for (j = 0; j < PTRS_PER_PGD; j++)
+			release_pgd(lg->pgdirs[i].pgdir + j);
+	}
 }
 
 /*
@@ -809,6 +782,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
 	release_all_pagetables(cpu->lg);
 	/* We need the Guest kernel stack mapped again. */
 	pin_stack_pages(cpu);
+	/* And we need Switcher allocated. */
+	if (!allocate_switcher_mapping(cpu))
+		kill_guest(cpu, "Cannot populate switcher mapping");
 }
 
 /*H:430
@@ -844,9 +820,15 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
 		newpgdir = new_pgdir(cpu, pgtable, &repin);
 	/* Change the current pgd index to the new one. */
 	cpu->cpu_pgd = newpgdir;
-	/* If it was completely blank, we map in the Guest kernel stack */
+	/*
+	 * If it was completely blank, we map in the Guest kernel stack and
+	 * the Switcher.
+	 */
 	if (repin)
 		pin_stack_pages(cpu);
+
+	if (!allocate_switcher_mapping(cpu))
+		kill_guest(cpu, "Cannot populate switcher mapping");
 }
 /*:*/
 
@@ -976,14 +958,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
 	int pgdir;
 
-	if (idx >= SWITCHER_PGD_INDEX)
+	if (idx > PTRS_PER_PGD) {
+		kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
+			   idx, PTRS_PER_PGD);
 		return;
+	}
 
 	/* If they're talking about a page table we have a shadow for... */
 	pgdir = find_pgdir(lg, gpgdir);
-	if (pgdir < ARRAY_SIZE(lg->pgdirs))
+	if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
 		/* ... throw it away. */
 		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
+		/* That might have been the Switcher mapping, remap it. */
+		if (!allocate_switcher_mapping(&lg->cpus[0])) {
+			kill_guest(&lg->cpus[0],
+				   "Cannot populate switcher mapping");
+		}
+	}
 }
 
 #ifdef CONFIG_X86_PAE
@@ -1001,6 +992,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
  * we will populate on future faults.  The Guest doesn't have any actual
  * pagetables yet, so we set linear_pages to tell demand_page() to fake it
  * for the moment.
+ *
+ * We do need the Switcher to be mapped at all times, so we allocate that
+ * part of the Guest page table here.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
@@ -1014,6 +1008,13 @@ int init_guest_pagetable(struct lguest *lg)
 
 	/* We start with a linear mapping until the initialize. */
 	cpu->linear_pages = true;
+
+	/* Allocate the page tables for the Switcher. */
+	if (!allocate_switcher_mapping(cpu)) {
+		release_all_pagetables(lg);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -1065,91 +1066,68 @@ void free_guest_pagetable(struct lguest *lg)
  * (vi) Mapping the Switcher when the Guest is about to run.
  *
  * The Switcher and the two pages for this CPU need to be visible in the
- * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
- * for each CPU already set up, we just need to hook them in now we know which
- * Guest is about to run on this CPU.
+ * Guest (and not the pages for other CPUs).
+ *
+ * The pages have all been allocate
  */
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
-	pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
-	pte_t regs_pte;
+	unsigned long base, i;
+	struct page *percpu_switcher_page, *regs_page;
+	pte_t *pte;
 
-#ifdef CONFIG_X86_PAE
-	pmd_t switcher_pmd;
-	pmd_t *pmd_table;
-
-	switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
-			       PAGE_KERNEL_EXEC);
-
-	/* Figure out where the pmd page is, by reading the PGD, and converting
-	 * it to a virtual address. */
-	pmd_table = __va(pgd_pfn(cpu->lg->
-			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
-								<< PAGE_SHIFT);
-	/* Now write it into the shadow page table. */
-	set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
-#else
-	pgd_t switcher_pgd;
+	/* Code page should always be mapped, and executable. */
+	pte = find_spte(cpu, switcher_addr, false, 0, 0);
+	get_page(lg_switcher_pages[0]);
+	set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
 
-	/*
-	 * Make the last PGD entry for this Guest point to the Switcher's PTE
-	 * page for this CPU (with appropriate flags).
-	 */
-	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
+	/* Clear all the Switcher mappings for any other CPUs. */
+	/* FIXME: This is dumb: update only when Host CPU changes. */
+	for_each_possible_cpu(i) {
+		/* Get location of lguest_pages (indexed by Host CPU) */
+		base = switcher_addr + PAGE_SIZE
+			+ i * sizeof(struct lguest_pages);
 
-	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+		/* Get shadow PTE for first page (where we put guest regs). */
+		pte = find_spte(cpu, base, false, 0, 0);
+		set_pte(pte, __pte(0));
+
+		/* This is where we put R/O state. */
+		pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+		set_pte(pte, __pte(0));
+	}
 
-#endif
 	/*
-	 * We also change the Switcher PTE page.  When we're running the Guest,
-	 * we want the Guest's "regs" page to appear where the first Switcher
-	 * page for this CPU is.  This is an optimization: when the Switcher
-	 * saves the Guest registers, it saves them into the first page of this
-	 * CPU's "struct lguest_pages": if we make sure the Guest's register
-	 * page is already mapped there, we don't have to copy them out
-	 * again.
+	 * When we're running the Guest, we want the Guest's "regs" page to
+	 * appear where the first Switcher page for this CPU is.  This is an
+	 * optimization: when the Switcher saves the Guest registers, it saves
+	 * them into the first page of this CPU's "struct lguest_pages": if we
+	 * make sure the Guest's register page is already mapped there, we
+	 * don't have to copy them out again.
 	 */
-	regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
-	set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
-}
-/*:*/
-
-static void free_switcher_pte_pages(void)
-{
-	unsigned int i;
-
-	for_each_possible_cpu(i)
-		free_page((long)switcher_pte_page(i));
-}
-
-/*H:520
- * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher and per-cpu pages.
- */
-static __init void populate_switcher_pte_page(unsigned int cpu,
-					      struct page *switcher_pages[])
-{
-	pte_t *pte = switcher_pte_page(cpu);
-	int i;
-
-	/* The first entries maps the Switcher code. */
-	set_pte(&pte[0], mk_pte(switcher_pages[0],
-				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-
-	/* The only other thing we map is this CPU's pair of pages. */
-	i = 1 + cpu*2;
-
-	/* First page (Guest registers) is writable from the Guest */
-	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]),
-			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
+	/* Find the shadow PTE for this regs page. */
+	base = switcher_addr + PAGE_SIZE
+		+ raw_smp_processor_id() * sizeof(struct lguest_pages);
+	pte = find_spte(cpu, base, false, 0, 0);
+	regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
+	get_page(regs_page);
+	set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
 
 	/*
-	 * The second page contains the "struct lguest_ro_state", and is
-	 * read-only.
+	 * We map the second page of the struct lguest_pages read-only in
+	 * the Guest: the IDT, GDT and other things it's not supposed to
+	 * change.
 	 */
-	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]),
-			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
+	base += PAGE_SIZE;
+	pte = find_spte(cpu, base, false, 0, 0);
+
+	percpu_switcher_page
+		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
+	get_page(percpu_switcher_page);
+	set_pte(pte, mk_pte(percpu_switcher_page,
+			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
 }
+/*:*/
 
 /*
  * We've made it through the page table code.  Perhaps our tired brains are
@@ -1163,29 +1141,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
  *
  * There is just one file remaining in the Host.
  */
-
-/*H:510
- * At boot or module load time, init_pagetables() allocates and populates
- * the Switcher PTE page for each CPU.
- */
-__init int init_pagetables(struct page **switcher_pages)
-{
-	unsigned int i;
-
-	for_each_possible_cpu(i) {
-		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
-		if (!switcher_pte_page(i)) {
-			free_switcher_pte_pages();
-			return -ENOMEM;
-		}
-		populate_switcher_pte_page(i, switcher_pages);
-	}
-	return 0;
-}
-/*:*/
-
-/* Cleaning up simply involves freeing the PTE page for each CPU. */
-void free_pagetables(void)
-{
-	free_switcher_pte_pages();
-}
-- 
cgit v1.2.3


From 86935fc4ee4d95efe01b6c91cd5143fa4c38c02b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:41 +0930
Subject: lguest: map Switcher text whenever we allocate a new pagetable.

It's always the same, so no need to put in the PTE every time we're
about to run.  Keep a flag to track whether the pagetable has the
Switcher entries allocated, and when allocating always initialize the
Switcher text PTE.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/lg.h          |  1 +
 drivers/lguest/page_tables.c | 42 ++++++++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index faac9fc6db22..005929a3fd52 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -16,6 +16,7 @@
 
 struct pgdir {
 	unsigned long gpgdir;
+	bool switcher_mapped;
 	pgd_t *pgdir;
 };
 
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 1f48f2712f3a..d1a5de45be02 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -736,18 +736,39 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 
 /*H:501
  * We do need the Switcher code mapped at all times, so we allocate that
- * part of the Guest page table here, and populate it when we're about to run
- * the guest.
+ * part of the Guest page table here.  We map the Switcher code immediately,
+ * but defer mapping of the guest register page and IDT/LDT etc page until
+ * just before we run the guest in map_switcher_in_guest().
+ *
+ * We *could* do this setup in map_switcher_in_guest(), but at that point
+ * we've interrupts disabled, and allocating pages like that is fraught: we
+ * can't sleep if we need to free up some memory.
  */
 static bool allocate_switcher_mapping(struct lg_cpu *cpu)
 {
 	int i;
 
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
-			       CHECK_GPGD_MASK, _PAGE_TABLE))
+		pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
+				       CHECK_GPGD_MASK, _PAGE_TABLE);
+		if (!pte)
 			return false;
+
+		/*
+		 * Map the switcher page if not already there.  It might
+		 * already be there because we call allocate_switcher_mapping()
+		 * in guest_set_pgd() just in case it did discard our Switcher
+		 * mapping, but it probably didn't.
+		 */
+		if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
+			/* Get a reference to the Switcher page. */
+			get_page(lg_switcher_pages[0]);
+			/* Create a read-only, executable, kernel-style PTE */
+			set_pte(pte,
+				mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
+		}
 	}
+	cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
 	return true;
 }
 
@@ -768,6 +789,7 @@ static void release_all_pagetables(struct lguest *lg)
 		/* Every PGD entry. */
 		for (j = 0; j < PTRS_PER_PGD; j++)
 			release_pgd(lg->pgdirs[i].pgdir + j);
+		lg->pgdirs[i].switcher_mapped = false;
 	}
 }
 
@@ -827,8 +849,10 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
 	if (repin)
 		pin_stack_pages(cpu);
 
-	if (!allocate_switcher_mapping(cpu))
-		kill_guest(cpu, "Cannot populate switcher mapping");
+	if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
+		if (!allocate_switcher_mapping(cpu))
+			kill_guest(cpu, "Cannot populate switcher mapping");
+	}
 }
 /*:*/
 
@@ -1076,10 +1100,8 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 	struct page *percpu_switcher_page, *regs_page;
 	pte_t *pte;
 
-	/* Code page should always be mapped, and executable. */
-	pte = find_spte(cpu, switcher_addr, false, 0, 0);
-	get_page(lg_switcher_pages[0]);
-	set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
+	/* Switcher page should always be mapped! */
+	BUG_ON(!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped);
 
 	/* Clear all the Switcher mappings for any other CPUs. */
 	/* FIXME: This is dumb: update only when Host CPU changes. */
-- 
cgit v1.2.3


From 6d0cda93c0d3c8bb0a553047c10f114c88c8af89 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:41 +0930
Subject: lguest: cache last cpu we ran on.

This optimizes the frobbing of our Switcher map.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/lg.h          |  1 +
 drivers/lguest/page_tables.c | 78 ++++++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 25 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 005929a3fd52..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -17,6 +17,7 @@
 struct pgdir {
 	unsigned long gpgdir;
 	bool switcher_mapped;
+	int last_host_cpu;
 	pgd_t *pgdir;
 };
 
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index d1a5de45be02..19611b0551cd 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
  * converted Guest pages when running the Guest.
 :*/
 
-/* Copyright (C) Rusty Russell IBM Corporation 2006.
+/* Copyright (C) Rusty Russell IBM Corporation 2013.
  * GPL v2 and any later version */
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -731,6 +731,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 	/* Release all the non-kernel mappings. */
 	flush_user_mappings(cpu->lg, next);
 
+	/* This hasn't run on any CPU at all. */
+	cpu->lg->pgdirs[next].last_host_cpu = -1;
+
 	return next;
 }
 
@@ -790,6 +793,7 @@ static void release_all_pagetables(struct lguest *lg)
 		for (j = 0; j < PTRS_PER_PGD; j++)
 			release_pgd(lg->pgdirs[i].pgdir + j);
 		lg->pgdirs[i].switcher_mapped = false;
+		lg->pgdirs[i].last_host_cpu = -1;
 	}
 }
 
@@ -1086,37 +1090,62 @@ void free_guest_pagetable(struct lguest *lg)
 		free_page((long)lg->pgdirs[i].pgdir);
 }
 
+/*H:481
+ * This clears the Switcher mappings for cpu #i.
+ */
+static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
+{
+	unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
+	pte_t *pte;
+
+	/* Clear the mappings for both pages. */
+	pte = find_spte(cpu, base, false, 0, 0);
+	release_pte(*pte);
+	set_pte(pte, __pte(0));
+
+	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+	release_pte(*pte);
+	set_pte(pte, __pte(0));
+}
+
 /*H:480
  * (vi) Mapping the Switcher when the Guest is about to run.
  *
- * The Switcher and the two pages for this CPU need to be visible in the
- * Guest (and not the pages for other CPUs).
+ * The Switcher and the two pages for this CPU need to be visible in the Guest
+ * (and not the pages for other CPUs).
  *
- * The pages have all been allocate
+ * The pages for the pagetables have all been allocated before: we just need
+ * to make sure the actual PTEs are up-to-date for the CPU we're about to run
+ * on.
  */
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
-	unsigned long base, i;
+	unsigned long base;
 	struct page *percpu_switcher_page, *regs_page;
 	pte_t *pte;
+	struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
 
-	/* Switcher page should always be mapped! */
-	BUG_ON(!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped);
-
-	/* Clear all the Switcher mappings for any other CPUs. */
-	/* FIXME: This is dumb: update only when Host CPU changes. */
-	for_each_possible_cpu(i) {
-		/* Get location of lguest_pages (indexed by Host CPU) */
-		base = switcher_addr + PAGE_SIZE
-			+ i * sizeof(struct lguest_pages);
+	/* Switcher page should always be mapped by now! */
+	BUG_ON(!pgdir->switcher_mapped);
 
-		/* Get shadow PTE for first page (where we put guest regs). */
-		pte = find_spte(cpu, base, false, 0, 0);
-		set_pte(pte, __pte(0));
+	/* 
+	 * Remember that we have two pages for each Host CPU, so we can run a
+	 * Guest on each CPU without them interfering.  We need to make sure
+	 * those pages are mapped correctly in the Guest, but since we usually
+	 * run on the same CPU, we cache that, and only update the mappings
+	 * when we move.
+	 */
+	if (pgdir->last_host_cpu == raw_smp_processor_id())
+		return;
 
-		/* This is where we put R/O state. */
-		pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-		set_pte(pte, __pte(0));
+	/* -1 means unknown so we remove everything. */
+	if (pgdir->last_host_cpu == -1) {
+		unsigned int i;
+		for_each_possible_cpu(i)
+			remove_switcher_percpu_map(cpu, i);
+	} else {
+		/* We know exactly what CPU mapping to remove. */
+		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
 	}
 
 	/*
@@ -1140,18 +1169,17 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 	 * the Guest: the IDT, GDT and other things it's not supposed to
 	 * change.
 	 */
-	base += PAGE_SIZE;
-	pte = find_spte(cpu, base, false, 0, 0);
-
+	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
 	percpu_switcher_page
 		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
 	get_page(percpu_switcher_page);
 	set_pte(pte, mk_pte(percpu_switcher_page,
 			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+
+	pgdir->last_host_cpu = raw_smp_processor_id();
 }
-/*:*/
 
-/*
+/*H:490
  * We've made it through the page table code.  Perhaps our tired brains are
  * still processing the details, or perhaps we're simply glad it's over.
  *
-- 
cgit v1.2.3


From 6b39271746de131366a14bcf04f5740cdc4abdef Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 22 Apr 2013 14:10:42 +0930
Subject: lguest: map Switcher below fixmap.

Now we've adjusted all the code, we can simply set switcher_addr to
wherever it needs to go below the fixmaps, rather than asserting that
it should be so.

With large NR_CPUS and PAE, people were hitting the "mapping switcher
would thwack fixmap" message.

Reported-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 7e1d7ee36478..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -83,18 +83,13 @@ static __init int map_switcher(void)
 		}
 	}
 
-	switcher_addr = SWITCHER_ADDR;
-
 	/*
-	 * First we check that the Switcher won't overlap the fixmap area at
-	 * the top of memory.  It's currently nowhere near, but it could have
-	 * very strange effects if it ever happened.
+	 * We place the Switcher underneath the fixmap area, which is the
+	 * highest virtual address we can get.  This is important, since we
+	 * tell the Guest it can't access this memory, so we want its ceiling
+	 * as high as possible.
 	 */
-	if (switcher_addr + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
-		err = -ENOMEM;
-		printk("lguest: mapping switcher would thwack fixmap\n");
-		goto free_pages;
-	}
+	switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
 
 	/*
 	 * Now we reserve the "virtual memory area" we want.  We might
-- 
cgit v1.2.3


From 55257d72bd1c51f25106350f4983ec19f62ed1fa Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Mon, 29 Apr 2013 12:00:08 +0930
Subject: virtio-net: fill only rx queues which are being used

Due to MQ support we may allocate a whole bunch of rx queues but
never use them. With this patch we'll save the space used by
the receive buffers until they are actually in use:

sh-4.2# free -h
             total       used       free     shared    buffers     cached
Mem:          490M        35M       455M         0B         0B       4.1M
-/+ buffers/cache:        31M       459M
Swap:           0B         0B         0B
sh-4.2# ethtool -L eth0 combined 8
sh-4.2# free -h
             total       used       free     shared    buffers     cached
Mem:          490M       162M       327M         0B         0B       4.1M
-/+ buffers/cache:       158M       331M
Swap:           0B         0B         0B

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/virtio_net.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d88d4366d9ac..b082e1c39031 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -581,7 +581,7 @@ static void refill_work(struct work_struct *work)
 	bool still_empty;
 	int i;
 
-	for (i = 0; i < vi->max_queue_pairs; i++) {
+	for (i = 0; i < vi->curr_queue_pairs; i++) {
 		struct receive_queue *rq = &vi->rq[i];
 
 		napi_disable(&rq->napi);
@@ -636,7 +636,7 @@ static int virtnet_open(struct net_device *dev)
 	struct virtnet_info *vi = netdev_priv(dev);
 	int i;
 
-	for (i = 0; i < vi->max_queue_pairs; i++) {
+	for (i = 0; i < vi->curr_queue_pairs; i++) {
 		/* Make sure we have some buffers: if oom use wq. */
 		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
 			schedule_delayed_work(&vi->refill, 0);
@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 	struct scatterlist sg;
 	struct virtio_net_ctrl_mq s;
 	struct net_device *dev = vi->dev;
+	int i;
 
 	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
 		return 0;
@@ -912,8 +913,12 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
 			 queue_pairs);
 		return -EINVAL;
-	} else
+	} else {
+		for (i = vi->curr_queue_pairs; i < queue_pairs; i++)
+			if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+				schedule_delayed_work(&vi->refill, 0);
 		vi->curr_queue_pairs = queue_pairs;
+	}
 
 	return 0;
 }
@@ -1566,7 +1571,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	/* Last of all, set up some receive buffers. */
-	for (i = 0; i < vi->max_queue_pairs; i++) {
+	for (i = 0; i < vi->curr_queue_pairs; i++) {
 		try_fill_recv(&vi->rq[i], GFP_KERNEL);
 
 		/* If we didn't even get one input buffer, we're useless. */
@@ -1690,7 +1695,7 @@ static int virtnet_restore(struct virtio_device *vdev)
 
 	netif_device_attach(vi->dev);
 
-	for (i = 0; i < vi->max_queue_pairs; i++)
+	for (i = 0; i < vi->curr_queue_pairs; i++)
 		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
 			schedule_delayed_work(&vi->refill, 0);
 
-- 
cgit v1.2.3


From c2ecd51531c881c8d47d77ea30395f7f03d42da3 Mon Sep 17 00:00:00 2001
From: Cosmin Paraschiv <csmnprschv@gmail.com>
Date: Tue, 30 Apr 2013 09:23:09 +0930
Subject: lguest: improve code readability in lg_cpu_start.

Make the container_of call friendlier and fix some comment slip-ups.

Signed-off-by: Cosmin Paraschiv <csmnprschv@gmail.com>
Cc: Daniel Baluta <dbaluta@ixiacom.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/lguest_user.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
  */
 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 {
-	/* We have a limited number the number of CPUs in the lguest struct. */
+	/* We have a limited number of CPUs in the lguest struct. */
 	if (id >= ARRAY_SIZE(cpu->lg->cpus))
 		return -EINVAL;
 
 	/* Set up this CPU's id, and pointer back to the lguest struct. */
 	cpu->id = id;
-	cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+	cpu->lg = container_of(cpu, struct lguest, cpus[id]);
 	cpu->lg->nr_cpus++;
 
 	/* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	if (!cpu->regs_page)
 		return -ENOMEM;
 
-	/* We actually put the registers at the bottom of the page. */
+	/* We actually put the registers at the end of the page. */
 	cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
 
 	/*
-- 
cgit v1.2.3


From 01d779a14ef800b74684d9692add4944df052461 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= <sjur.brandeland@stericsson.com>
Date: Wed, 1 May 2013 11:57:50 +0930
Subject: caif_virtio: Remove bouncing email addresses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove our (soon to be) bouncing email addresses,
and update Dmitry's address.
Dmitry will take over as maintainer for CAIF from now on.

Cc: Vikram Arv <vikram.arv@stericsson.com>
Cc: Dmitry Tarnyagin <dmitry.tarnyagin@stericsson.com>
Cc: Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
Signed-off-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Dmity Tarnyagin <dmitry.tarnyagin@lockless.no>
---
 drivers/net/caif/caif_virtio.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index 0e3bede8b8a8..b9ed1288ce2d 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -1,8 +1,8 @@
 /*
  * Copyright (C) ST-Ericsson AB 2013
- * Authors: Vicram Arv / vikram.arv@stericsson.com,
- *	    Dmitry Tarnyagin / dmitry.tarnyagin@stericsson.com
- *	    Sjur Brendeland / sjur.brandeland@stericsson.com
+ * Authors: Vicram Arv
+ *	    Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
+ *	    Sjur Brendeland
  * License terms: GNU General Public License (GPL) version 2
  */
 #include <linux/module.h>
@@ -23,8 +23,8 @@
 #include <linux/virtio_config.h>
 
 MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Vicram Arv <vikram.arv@stericsson.com>");
-MODULE_AUTHOR("Sjur Brendeland <sjur.brandeland@stericsson.com>");
+MODULE_AUTHOR("Vicram Arv");
+MODULE_AUTHOR("Sjur Brendeland");
 MODULE_DESCRIPTION("Virtio CAIF Driver");
 
 /* NAPI schedule quota */
-- 
cgit v1.2.3