aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/s390/00-INDEX2
-rw-r--r--Documentation/s390/vfio-ccw.txt303
-rw-r--r--MAINTAINERS10
-rw-r--r--arch/s390/Kconfig10
-rw-r--r--arch/s390/include/asm/cio.h18
-rw-r--r--arch/s390/include/asm/isc.h1
-rw-r--r--drivers/iommu/Kconfig8
-rw-r--r--drivers/s390/cio/Makefile3
-rw-r--r--drivers/s390/cio/cio.c69
-rw-r--r--drivers/s390/cio/cio.h1
-rw-r--r--drivers/s390/cio/device_fsm.c54
-rw-r--r--drivers/s390/cio/vfio_ccw_cp.c842
-rw-r--r--drivers/s390/cio/vfio_ccw_cp.h42
-rw-r--r--drivers/s390/cio/vfio_ccw_drv.c308
-rw-r--r--drivers/s390/cio/vfio_ccw_fsm.c207
-rw-r--r--drivers/s390/cio/vfio_ccw_ops.c447
-rw-r--r--drivers/s390/cio/vfio_ccw_private.h96
-rw-r--r--include/uapi/linux/vfio.h18
-rw-r--r--include/uapi/linux/vfio_ccw.h24
19 files changed, 2417 insertions, 46 deletions
diff --git a/Documentation/s390/00-INDEX b/Documentation/s390/00-INDEX
index 9189535f6cd2..317f0378ae01 100644
--- a/Documentation/s390/00-INDEX
+++ b/Documentation/s390/00-INDEX
@@ -22,5 +22,7 @@ qeth.txt
- HiperSockets Bridge Port Support.
s390dbf.txt
- information on using the s390 debug feature.
+vfio-ccw.txt
+ information on the vfio-ccw I/O subchannel driver.
zfcpdump.txt
- information on the s390 SCSI dump tool.
diff --git a/Documentation/s390/vfio-ccw.txt b/Documentation/s390/vfio-ccw.txt
new file mode 100644
index 000000000000..90b3dfead81b
--- /dev/null
+++ b/Documentation/s390/vfio-ccw.txt
@@ -0,0 +1,303 @@
+vfio-ccw: the basic infrastructure
+==================================
+
+Introduction
+------------
+
+Here we describe the vfio support for I/O subchannel devices for
+Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a
+virtual machine, while vfio is the means.
+
+Different than other hardware architectures, s390 has defined a unified
+I/O access method, which is so called Channel I/O. It has its own access
+patterns:
+- Channel programs run asynchronously on a separate (co)processor.
+- The channel subsystem will access any memory designated by the caller
+ in the channel program directly, i.e. there is no iommu involved.
+Thus when we introduce vfio support for these devices, we realize it
+with a mediated device (mdev) implementation. The vfio mdev will be
+added to an iommu group, so as to make itself able to be managed by the
+vfio framework. And we add read/write callbacks for special vfio I/O
+regions to pass the channel programs from the mdev to its parent device
+(the real I/O subchannel device) to do further address translation and
+to perform I/O instructions.
+
+This document does not intend to explain the s390 I/O architecture in
+every detail. More information/reference could be found here:
+- A good start to know Channel I/O in general:
+ https://en.wikipedia.org/wiki/Channel_I/O
+- s390 architecture:
+ s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+- The existing Qemu code which implements a simple emulated channel
+ subsystem could also be a good reference. It makes it easier to follow
+ the flow.
+ qemu/hw/s390x/css.c
+
+For vfio mediated device framework:
+- Documentation/vfio-mediated-device.txt
+
+Motivation of vfio-ccw
+----------------------
+
+Currently, a guest virtualized via qemu/kvm on s390 only sees
+paravirtualized virtio devices via the "Virtio Over Channel I/O
+(virtio-ccw)" transport. This makes virtio devices discoverable via
+standard operating system algorithms for handling channel devices.
+
+However this is not enough. On s390 for the majority of devices, which
+use the standard Channel I/O based mechanism, we also need to provide
+the functionality of passing through them to a Qemu virtual machine.
+This includes devices that don't have a virtio counterpart (e.g. tape
+drives) or that have specific characteristics which guests want to
+exploit.
+
+For passing a device to a guest, we want to use the same interface as
+everybody else, namely vfio. Thus, we would like to introduce vfio
+support for channel devices. And we would like to name this new vfio
+device "vfio-ccw".
+
+Access patterns of CCW devices
+------------------------------
+
+s390 architecture has implemented a so called channel subsystem, that
+provides a unified view of the devices physically attached to the
+systems. Though the s390 hardware platform knows about a huge variety of
+different peripheral attachments like disk devices (aka. DASDs), tapes,
+communication controllers, etc. They can all be accessed by a well
+defined access method and they are presenting I/O completion a unified
+way: I/O interruptions.
+
+All I/O requires the use of channel command words (CCWs). A CCW is an
+instruction to a specialized I/O channel processor. A channel program is
+a sequence of CCWs which are executed by the I/O channel subsystem. To
+issue a channel program to the channel subsystem, it is required to
+build an operation request block (ORB), which can be used to point out
+the format of the CCW and other control information to the system. The
+operating system signals the I/O channel subsystem to begin executing
+the channel program with a SSCH (start sub-channel) instruction. The
+central processor is then free to proceed with non-I/O instructions
+until interrupted. The I/O completion result is received by the
+interrupt handler in the form of interrupt response block (IRB).
+
+Back to vfio-ccw, in short:
+- ORBs and channel programs are built in guest kernel (with guest
+ physical addresses).
+- ORBs and channel programs are passed to the host kernel.
+- Host kernel translates the guest physical addresses to real addresses
+ and starts the I/O with issuing a privileged Channel I/O instruction
+ (e.g SSCH).
+- channel programs run asynchronously on a separate processor.
+- I/O completion will be signaled to the host with I/O interruptions.
+ And it will be copied as IRB to user space to pass it back to the
+ guest.
+
+Physical vfio ccw device and its child mdev
+-------------------------------------------
+
+As mentioned above, we realize vfio-ccw with a mdev implementation.
+
+Channel I/O does not have IOMMU hardware support, so the physical
+vfio-ccw device does not have an IOMMU level translation or isolation.
+
+Sub-channel I/O instructions are all privileged instructions, When
+handling the I/O instruction interception, vfio-ccw has the software
+policing and translation how the channel program is programmed before
+it gets sent to hardware.
+
+Within this implementation, we have two drivers for two types of
+devices:
+- The vfio_ccw driver for the physical subchannel device.
+ This is an I/O subchannel driver for the real subchannel device. It
+ realizes a group of callbacks and registers to the mdev framework as a
+ parent (physical) device. As a consequence, mdev provides vfio_ccw a
+ generic interface (sysfs) to create mdev devices. A vfio mdev could be
+ created by vfio_ccw then and added to the mediated bus. It is the vfio
+ device that added to an IOMMU group and a vfio group.
+ vfio_ccw also provides an I/O region to accept channel program
+ request from user space and store I/O interrupt result for user
+ space to retrieve. To notify user space an I/O completion, it offers
+ an interface to setup an eventfd fd for asynchronous signaling.
+
+- The vfio_mdev driver for the mediated vfio ccw device.
+ This is provided by the mdev framework. It is a vfio device driver for
+ the mdev that created by vfio_ccw.
+ It realize a group of vfio device driver callbacks, adds itself to a
+ vfio group, and registers itself to the mdev framework as a mdev
+ driver.
+ It uses a vfio iommu backend that uses the existing map and unmap
+ ioctls, but rather than programming them into an IOMMU for a device,
+ it simply stores the translations for use by later requests. This
+ means that a device programmed in a VM with guest physical addresses
+ can have the vfio kernel convert that address to process virtual
+ address, pin the page and program the hardware with the host physical
+ address in one step.
+ For a mdev, the vfio iommu backend will not pin the pages during the
+ VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database
+ of the iova<->vaddr mappings in this operation. And they export a
+ vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu
+ backend for the physical devices to pin and unpin pages by demand.
+
+Below is a high Level block diagram.
+
+ +-------------+
+ | |
+ | +---------+ | mdev_register_driver() +--------------+
+ | | Mdev | +<-----------------------+ |
+ | | bus | | | vfio_mdev.ko |
+ | | driver | +----------------------->+ |<-> VFIO user
+ | +---------+ | probe()/remove() +--------------+ APIs
+ | |
+ | MDEV CORE |
+ | MODULE |
+ | mdev.ko |
+ | +---------+ | mdev_register_device() +--------------+
+ | |Physical | +<-----------------------+ |
+ | | device | | | vfio_ccw.ko |<-> subchannel
+ | |interface| +----------------------->+ | device
+ | +---------+ | callback +--------------+
+ +-------------+
+
+The process of how these work together.
+1. vfio_ccw.ko drives the physical I/O subchannel, and registers the
+ physical device (with callbacks) to mdev framework.
+ When vfio_ccw probing the subchannel device, it registers device
+ pointer and callbacks to the mdev framework. Mdev related file nodes
+ under the device node in sysfs would be created for the subchannel
+ device, namely 'mdev_create', 'mdev_destroy' and
+ 'mdev_supported_types'.
+2. Create a mediated vfio ccw device.
+ Use the 'mdev_create' sysfs file, we need to manually create one (and
+ only one for our case) mediated device.
+3. vfio_mdev.ko drives the mediated ccw device.
+ vfio_mdev is also the vfio device drvier. It will probe the mdev and
+ add it to an iommu_group and a vfio_group. Then we could pass through
+ the mdev to a guest.
+
+vfio-ccw I/O region
+-------------------
+
+An I/O region is used to accept channel program request from user
+space and store I/O interrupt result for user space to retrieve. The
+defination of the region is:
+
+struct ccw_io_region {
+#define ORB_AREA_SIZE 12
+ __u8 orb_area[ORB_AREA_SIZE];
+#define SCSW_AREA_SIZE 12
+ __u8 scsw_area[SCSW_AREA_SIZE];
+#define IRB_AREA_SIZE 96
+ __u8 irb_area[IRB_AREA_SIZE];
+ __u32 ret_code;
+} __packed;
+
+While starting an I/O request, orb_area should be filled with the
+guest ORB, and scsw_area should be filled with the SCSW of the Virtual
+Subchannel.
+
+irb_area stores the I/O result.
+
+ret_code stores a return code for each access of the region.
+
+vfio-ccw patches overview
+-------------------------
+
+For now, our patches are rebased on the latest mdev implementation.
+vfio-ccw follows what vfio-pci did on the s390 paltform and uses
+vfio-iommu-type1 as the vfio iommu backend. It's a good start to launch
+the code review for vfio-ccw. Note that the implementation is far from
+complete yet; but we'd like to get feedback for the general
+architecture.
+
+* CCW translation APIs
+- Description:
+ These introduce a group of APIs (start with 'cp_') to do CCW
+ translation. The CCWs passed in by a user space program are
+ organized with their guest physical memory addresses. These APIs
+ will copy the CCWs into the kernel space, and assemble a runnable
+ kernel channel program by updating the guest physical addresses with
+ their corresponding host physical addresses.
+- Patches:
+ vfio: ccw: introduce channel program interfaces
+
+* vfio_ccw device driver
+- Description:
+ The following patches utilizes the CCW translation APIs and introduce
+ vfio_ccw, which is the driver for the I/O subchannel devices you want
+ to pass through.
+ vfio_ccw implements the following vfio ioctls:
+ VFIO_DEVICE_GET_INFO
+ VFIO_DEVICE_GET_IRQ_INFO
+ VFIO_DEVICE_GET_REGION_INFO
+ VFIO_DEVICE_RESET
+ VFIO_DEVICE_SET_IRQS
+ This provides an I/O region, so that the user space program can pass a
+ channel program to the kernel, to do further CCW translation before
+ issuing them to a real device.
+ This also provides the SET_IRQ ioctl to setup an event notifier to
+ notify the user space program the I/O completion in an asynchronous
+ way.
+- Patches:
+ vfio: ccw: basic implementation for vfio_ccw driver
+ vfio: ccw: introduce ccw_io_region
+ vfio: ccw: realize VFIO_DEVICE_GET_REGION_INFO ioctl
+ vfio: ccw: realize VFIO_DEVICE_RESET ioctl
+ vfio: ccw: realize VFIO_DEVICE_G(S)ET_IRQ_INFO ioctls
+
+The user of vfio-ccw is not limited to Qemu, while Qemu is definitely a
+good example to get understand how these patches work. Here is a little
+bit more detail how an I/O request triggered by the Qemu guest will be
+handled (without error handling).
+
+Explanation:
+Q1-Q7: Qemu side process.
+K1-K5: Kernel side process.
+
+Q1. Get I/O region info during initialization.
+Q2. Setup event notifier and handler to handle I/O completion.
+
+... ...
+
+Q3. Intercept a ssch instruction.
+Q4. Write the guest channel program and ORB to the I/O region.
+ K1. Copy from guest to kernel.
+ K2. Translate the guest channel program to a host kernel space
+ channel program, which becomes runnable for a real device.
+ K3. With the necessary information contained in the orb passed in
+ by Qemu, issue the ccwchain to the device.
+ K4. Return the ssch CC code.
+Q5. Return the CC code to the guest.
+
+... ...
+
+ K5. Interrupt handler gets the I/O result and write the result to
+ the I/O region.
+ K6. Signal Qemu to retrieve the result.
+Q6. Get the signal and event handler reads out the result from the I/O
+ region.
+Q7. Update the irb for the guest.
+
+Limitations
+-----------
+
+The current vfio-ccw implementation focuses on supporting basic commands
+needed to implement block device functionality (read/write) of DASD/ECKD
+device only. Some commands may need special handling in the future, for
+example, anything related to path grouping.
+
+DASD is a kind of storage device. While ECKD is a data recording format.
+More information for DASD and ECKD could be found here:
+https://en.wikipedia.org/wiki/Direct-access_storage_device
+https://en.wikipedia.org/wiki/Count_key_data
+
+Together with the corresponding work in Qemu, we can bring the passed
+through DASD/ECKD device online in a guest now and use it as a block
+device.
+
+Reference
+---------
+1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204)
+3. https://en.wikipedia.org/wiki/Channel_I/O
+4. Documentation/s390/cds.txt
+5. Documentation/vfio.txt
+6. Documentation/vfio-mediated-device.txt
diff --git a/MAINTAINERS b/MAINTAINERS
index 24136346fe58..2ad7588e8a13 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10860,6 +10860,16 @@ W: http://www.ibm.com/developerworks/linux/linux390/
S: Supported
F: drivers/iommu/s390-iommu.c
+S390 VFIO-CCW DRIVER
+M: Cornelia Huck <cornelia.huck@de.ibm.com>
+M: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+L: linux-s390@vger.kernel.org
+L: kvm@vger.kernel.org
+S: Supported
+F: drivers/s390/cio/vfio_ccw*
+F: Documentation/s390/vfio-ccw.txt
+F: include/uapi/linux/vfio_ccw.h
+
S3C24XX SD/MMC Driver
M: Ben Dooks <ben-linux@fluff.org>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index afac5d89ad1f..ca2fe764be2d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -682,6 +682,16 @@ config EADM_SCH
To compile this driver as a module, choose M here: the
module will be called eadm_sch.
+config VFIO_CCW
+ def_tristate n
+ prompt "Support for VFIO-CCW subchannels"
+ depends on S390_CCW_IOMMU && VFIO_MDEV
+ help
+ This driver allows usage of I/O subchannels via VFIO-CCW.
+
+ To compile this driver as a module, choose M here: the
+ module will be called vfio_ccw.
+
endmenu
menu "Dump support"
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index f7ed88cc066e..7a38ca85190b 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -33,6 +33,24 @@ struct ccw1 {
__u32 cda;
} __attribute__ ((packed,aligned(8)));
+/**
+ * struct ccw0 - channel command word
+ * @cmd_code: command code
+ * @cda: data address
+ * @flags: flags, like IDA addressing, etc.
+ * @reserved: will be ignored
+ * @count: byte count
+ *
+ * The format-0 ccw structure.
+ */
+struct ccw0 {
+ __u8 cmd_code;
+ __u32 cda : 24;
+ __u8 flags;
+ __u8 reserved;
+ __u16 count;
+} __packed __aligned(8);
+
#define CCW_FLAG_DC 0x80
#define CCW_FLAG_CC 0x40
#define CCW_FLAG_SLI 0x20
diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h
index 68d7d68300f2..8a0b721a9b8d 100644
--- a/arch/s390/include/asm/isc.h
+++ b/arch/s390/include/asm/isc.h
@@ -16,6 +16,7 @@
#define CONSOLE_ISC 1 /* console I/O subchannel */
#define EADM_SCH_ISC 4 /* EADM subchannels */
#define CHSC_SCH_ISC 7 /* CHSC subchannels */
+#define VFIO_CCW_ISC IO_SCH_ISC /* VFIO-CCW I/O subchannels */
/* Adapter interrupts. */
#define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */
#define PCI_ISC 2 /* PCI I/O subchannels */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 37e204f3d9be..6ee3a25ae731 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -327,6 +327,14 @@ config S390_IOMMU
help
Support for the IOMMU API for s390 PCI devices.
+config S390_CCW_IOMMU
+ bool "S390 CCW IOMMU Support"
+ depends on S390 && CCW
+ select IOMMU_API
+ help
+ Enables bits of IOMMU API required by VFIO. The iommu_ops
+ is not implemented as it is not necessary for VFIO.
+
config MTK_IOMMU
bool "MTK IOMMU Support"
depends on ARM || ARM64
diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile
index 3ab9aedeb84a..bdf47526038a 100644
--- a/drivers/s390/cio/Makefile
+++ b/drivers/s390/cio/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_CCWGROUP) += ccwgroup.o
qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o
obj-$(CONFIG_QDIO) += qdio.o
+
+vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o vfio_ccw_fsm.o
+obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 1b350665c823..89216174fcbb 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -170,12 +170,14 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */
return ccode;
}
}
+EXPORT_SYMBOL_GPL(cio_start_key);
int
cio_start (struct subchannel *sch, struct ccw1 *cpa, __u8 lpm)
{
return cio_start_key(sch, cpa, lpm, PAGE_DEFAULT_KEY);
}
+EXPORT_SYMBOL_GPL(cio_start);
/*
* resume suspended I/O operation
@@ -208,6 +210,7 @@ cio_resume (struct subchannel *sch)
return -ENODEV;
}
}
+EXPORT_SYMBOL_GPL(cio_resume);
/*
* halt I/O operation
@@ -241,6 +244,7 @@ cio_halt(struct subchannel *sch)
return -ENODEV;
}
}
+EXPORT_SYMBOL_GPL(cio_halt);
/*
* Clear I/O operation
@@ -271,6 +275,7 @@ cio_clear(struct subchannel *sch)
return -ENODEV;
}
}
+EXPORT_SYMBOL_GPL(cio_clear);
/*
* Function: cio_cancel
@@ -308,7 +313,68 @@ cio_cancel (struct subchannel *sch)
return -ENODEV;
}
}
+EXPORT_SYMBOL_GPL(cio_cancel);
+/**
+ * cio_cancel_halt_clear - Cancel running I/O by performing cancel, halt
+ * and clear ordinally if subchannel is valid.
+ * @sch: subchannel on which to perform the cancel_halt_clear operation
+ * @iretry: the number of the times remained to retry the next operation
+ *
+ * This should be called repeatedly since halt/clear are asynchronous
+ * operations. We do one try with cio_cancel, three tries with cio_halt,
+ * 255 tries with cio_clear. The caller should initialize @iretry with
+ * the value 255 for its first call to this, and keep using the same
+ * @iretry in the subsequent calls until it gets a non -EBUSY return.
+ *
+ * Returns 0 if device now idle, -ENODEV for device not operational,
+ * -EBUSY if an interrupt is expected (either from halt/clear or from a
+ * status pending), and -EIO if out of retries.
+ */
+int cio_cancel_halt_clear(struct subchannel *sch, int *iretry)
+{
+ int ret;
+
+ if (cio_update_schib(sch))
+ return -ENODEV;
+ if (!sch->schib.pmcw.ena)
+ /* Not operational -> done. */
+ return 0;
+ /* Stage 1: cancel io. */
+ if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_HALT_PEND) &&
+ !(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
+ if (!scsw_is_tm(&sch->schib.scsw)) {
+ ret = cio_cancel(sch);
+ if (ret != -EINVAL)
+ return ret;
+ }
+ /*
+ * Cancel io unsuccessful or not applicable (transport mode).
+ * Continue with asynchronous instructions.
+ */
+ *iretry = 3; /* 3 halt retries. */
+ }
+ /* Stage 2: halt io. */
+ if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
+ if (*iretry) {
+ *iretry -= 1;
+ ret = cio_halt(sch);
+ if (ret != -EBUSY)
+ return (ret == 0) ? -EBUSY : ret;
+ }
+ /* Halt io unsuccessful. */
+ *iretry = 255; /* 255 clear retries. */
+ }
+ /* Stage 3: clear io. */
+ if (*iretry) {
+ *iretry -= 1;
+ ret = cio_clear(sch);
+ return (ret == 0) ? -EBUSY : ret;
+ }
+ /* Function was unsuccessful */
+ return -EIO;
+}
+EXPORT_SYMBOL_GPL(cio_cancel_halt_clear);
static void cio_apply_config(struct subchannel *sch, struct schib *schib)
{
@@ -382,6 +448,7 @@ int cio_commit_config(struct subchannel *sch)
}
return ret;
}
+EXPORT_SYMBOL_GPL(cio_commit_config);
/**
* cio_update_schib - Perform stsch and update schib if subchannel is valid.
@@ -987,6 +1054,7 @@ int cio_tm_start_key(struct subchannel *sch, struct tcw *tcw, u8 lpm, u8 key)
return cio_start_handle_notoper(sch, lpm);
}
}
+EXPORT_SYMBOL_GPL(cio_tm_start_key);
/**
* cio_tm_intrg - perform interrogate function
@@ -1012,3 +1080,4 @@ int cio_tm_intrg(struct subchannel *sch)
return -ENODEV;
}
}
+EXPORT_SYMBOL_GPL(cio_tm_intrg);
diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h
index f0e57aefb5f2..939596d81b73 100644
--- a/drivers/s390/cio/cio.h
+++ b/drivers/s390/cio/cio.h
@@ -123,6 +123,7 @@ extern int cio_enable_subchannel(struct subchannel *, u32);
extern int cio_disable_subchannel (struct subchannel *);
extern int cio_cancel (struct subchannel *);
extern int cio_clear (struct subchannel *);
+extern int cio_cancel_halt_clear(struct subchannel *, int *);
extern int cio_resume (struct subchannel *);
extern int cio_halt (struct subchannel *);
extern int cio_start (struct subchannel *, struct ccw1 *, __u8);
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index 9afb5ce13007..12016e32e519 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -124,14 +124,6 @@ ccw_device_set_timeout(struct ccw_device *cdev, int expires)
add_timer(&cdev->private->timer);
}
-/*
- * Cancel running i/o. This is called repeatedly since halt/clear are
- * asynchronous operations. We do one try with cio_cancel, two tries
- * with cio_halt, 255 tries with cio_clear. If everythings fails panic.
- * Returns 0 if device now idle, -ENODEV for device not operational and
- * -EBUSY if an interrupt is expected (either from halt/clear or from a
- * status pending).
- */
int
ccw_device_cancel_halt_clear(struct ccw_device *cdev)
{
@@ -139,44 +131,14 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev)
int ret;
sch = to_subchannel(cdev->dev.parent);
- if (cio_update_schib(sch))
- return -ENODEV;
- if (!sch->schib.pmcw.ena)
- /* Not operational -> done. */
- return 0;
- /* Stage 1: cancel io. */
- if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_HALT_PEND) &&
- !(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
- if (!scsw_is_tm(&sch->schib.scsw)) {
- ret = cio_cancel(sch);
- if (ret != -EINVAL)
- return ret;
- }
- /* cancel io unsuccessful or not applicable (transport mode).
- * Continue with asynchronous instructions. */
- cdev->private->iretry = 3; /* 3 halt retries. */
- }
- if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) {
- /* Stage 2: halt io. */
- if (cdev->private->iretry) {
- cdev->private->iretry--;
- ret = cio_halt(sch);
- if (ret != -EBUSY)
- return (ret == 0) ? -EBUSY : ret;
- }
- /* halt io unsuccessful. */
- cdev->private->iretry = 255; /* 255 clear retries. */
- }
- /* Stage 3: clear io. */
- if (cdev->private->iretry) {
- cdev->private->iretry--;
- ret = cio_clear (sch);
- return (ret == 0) ? -EBUSY : ret;
- }
- /* Function was unsuccessful */
- CIO_MSG_EVENT(0, "0.%x.%04x: could not stop I/O\n",
- cdev->private->dev_id.ssid, cdev->private->dev_id.devno);
- return -EIO;
+ ret = cio_cancel_halt_clear(sch, &cdev->private->iretry);
+
+ if (ret == -EIO)
+ CIO_MSG_EVENT(0, "0.%x.%04x: could not stop I/O\n",
+ cdev->private->dev_id.ssid,
+ cdev->private->dev_id.devno);
+
+ return ret;
}
void ccw_device_update_sense_data(struct ccw_device *cdev)
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
new file mode 100644
index 000000000000..ba6ac83a6c25
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -0,0 +1,842 @@
+/*
+ * channel program interfaces
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/vfio.h>
+#include <asm/idals.h>
+
+#include "vfio_ccw_cp.h"
+
+/*
+ * Max length for ccw chain.
+ * XXX: Limit to 256, need to check more?
+ */
+#define CCWCHAIN_LEN_MAX 256
+
+struct pfn_array {
+ unsigned long pa_iova;
+ unsigned long *pa_iova_pfn;
+ unsigned long *pa_pfn;
+ int pa_nr;
+};
+
+struct pfn_array_table {
+ struct pfn_array *pat_pa;
+ int pat_nr;
+};
+
+struct ccwchain {
+ struct list_head next;
+ struct ccw1 *ch_ccw;
+ /* Guest physical address of the current chain. */
+ u64 ch_iova;
+ /* Count of the valid ccws in chain. */
+ int ch_len;
+ /* Pinned PAGEs for the original data. */
+ struct pfn_array_table *ch_pat;
+};
+
+/*
+ * pfn_array_pin() - pin user pages in memory
+ * @pa: pfn_array on which to perform the operation
+ * @mdev: the mediated device to perform pin/unpin operations
+ *
+ * Attempt to pin user pages in memory.
+ *
+ * Usage of pfn_array:
+ * @pa->pa_iova starting guest physical I/O address. Assigned by caller.
+ * @pa->pa_iova_pfn array that stores PFNs of the pages need to pin. Allocated
+ * by caller.
+ * @pa->pa_pfn array that receives PFNs of the pages pinned. Allocated by
+ * caller.
+ * @pa->pa_nr number of pages from @pa->pa_iova to pin. Assigned by
+ * caller.
+ * number of pages pinned. Assigned by callee.
+ *
+ * Returns:
+ * Number of pages pinned on success.
+ * If @pa->pa_nr is 0 or negative, returns 0.
+ * If no pages were pinned, returns -errno.
+ */
+static int pfn_array_pin(struct pfn_array *pa, struct device *mdev)
+{
+ int i, ret;
+
+ if (pa->pa_nr <= 0) {
+ pa->pa_nr = 0;
+ return 0;
+ }
+
+ pa->pa_iova_pfn[0] = pa->pa_iova >> PAGE_SHIFT;
+ for (i = 1; i < pa->pa_nr; i++)
+ pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1;
+
+ ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr,
+ IOMMU_READ | IOMMU_WRITE, pa->pa_pfn);
+
+ if (ret > 0 && ret != pa->pa_nr) {
+ vfio_unpin_pages(mdev, pa->pa_iova_pfn, ret);
+ pa->pa_nr = 0;
+ return 0;
+ }
+
+ return ret;
+}
+
+/* Unpin the pages before releasing the memory. */
+static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev)
+{
+ vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr);
+ pa->pa_nr = 0;
+ kfree(pa->pa_iova_pfn);
+}
+
+/* Alloc memory for PFNs, then pin pages with them. */
+static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev,
+ u64 iova, unsigned int len)
+{
+ int ret = 0;
+
+ if (!len || pa->pa_nr)
+ return -EINVAL;
+
+ pa->pa_iova = iova;
+
+ pa->pa_nr = ((iova & ~PAGE_MASK) + len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ if (!pa->pa_nr)
+ return -EINVAL;
+
+ pa->pa_iova_pfn = kcalloc(pa->pa_nr,
+ sizeof(*pa->pa_iova_pfn) +
+ sizeof(*pa->pa_pfn),
+ GFP_KERNEL);
+ if (unlikely(!pa->pa_iova_pfn))
+ return -ENOMEM;
+ pa->pa_pfn = pa->pa_iova_pfn + pa->pa_nr;
+
+ ret = pfn_array_pin(pa, mdev);
+
+ if (ret > 0)
+ return ret;
+ else if (!ret)
+ ret = -EINVAL;
+
+ kfree(pa->pa_iova_pfn);
+
+ return ret;
+}
+
+static int pfn_array_table_init(struct pfn_array_table *pat, int nr)
+{
+ pat->pat_pa = kcalloc(nr, sizeof(*pat->pat_pa), GFP_KERNEL);
+ if (unlikely(ZERO_OR_NULL_PTR(pat->pat_pa))) {
+ pat->pat_nr = 0;
+ return -ENOMEM;
+ }
+
+ pat->pat_nr = nr;
+
+ return 0;
+}
+
+static void pfn_array_table_unpin_free(struct pfn_array_table *pat,
+ struct device *mdev)
+{
+ int i;
+
+ for (i = 0; i < pat->pat_nr; i++)
+ pfn_array_unpin_free(pat->pat_pa + i, mdev);
+
+ if (pat->pat_nr) {
+ kfree(pat->pat_pa);
+ pat->pat_pa = NULL;
+ pat->pat_nr = 0;
+ }
+}
+
+static bool pfn_array_table_iova_pinned(struct pfn_array_table *pat,
+ unsigned long iova)
+{
+ struct pfn_array *pa = pat->pat_pa;
+ unsigned long iova_pfn = iova >> PAGE_SHIFT;
+ int i, j;
+
+ for (i = 0; i < pat->pat_nr; i++, pa++)
+ for (j = 0; j < pa->pa_nr; j++)
+ if (pa->pa_iova_pfn[i] == iova_pfn)
+ return true;
+
+ return false;
+}
+/* Create the list idal words for a pfn_array_table. */
+static inline void pfn_array_table_idal_create_words(
+ struct pfn_array_table *pat,
+ unsigned long *idaws)
+{
+ struct pfn_array *pa;
+ int i, j, k;
+
+ /*
+ * Idal words (execept the first one) rely on the memory being 4k
+ * aligned. If a user virtual address is 4K aligned, then it's
+ * corresponding kernel physical address will also be 4K aligned. Thus
+ * there will be no problem here to simply use the phys to create an
+ * idaw.
+ */
+ k = 0;
+ for (i = 0; i < pat->pat_nr; i++) {
+ pa = pat->pat_pa + i;
+ for (j = 0; j < pa->pa_nr; j++) {
+ idaws[k] = pa->pa_pfn[j] << PAGE_SHIFT;
+ if (k == 0)
+ idaws[k] += pa->pa_iova & (PAGE_SIZE - 1);
+ k++;
+ }
+ }
+}
+
+
+/*
+ * Within the domain (@mdev), copy @n bytes from a guest physical
+ * address (@iova) to a host physical address (@to).
+ */
+static long copy_from_iova(struct device *mdev,
+ void *to, u64 iova,
+ unsigned long n)
+{
+ struct pfn_array pa = {0};
+ u64 from;
+ int i, ret;
+ unsigned long l, m;
+
+ ret = pfn_array_alloc_pin(&pa, mdev, iova, n);
+ if (ret <= 0)
+ return ret;
+
+ l = n;
+ for (i = 0; i < pa.pa_nr; i++) {
+ from = pa.pa_pfn[i] << PAGE_SHIFT;
+ m = PAGE_SIZE;
+ if (i == 0) {
+ from += iova & (PAGE_SIZE - 1);
+ m -= iova & (PAGE_SIZE - 1);
+ }
+
+ m = min(l, m);
+ memcpy(to + (n - l), (void *)from, m);
+
+ l -= m;
+ if (l == 0)
+ break;
+ }
+
+ pfn_array_unpin_free(&pa, mdev);
+
+ return l;
+}
+
+static long copy_ccw_from_iova(struct channel_program *cp,
+ struct ccw1 *to, u64 iova,
+ unsigned long len)
+{
+ struct ccw0 ccw0;
+ struct ccw1 *pccw1;
+ int ret;
+ int i;
+
+ ret = copy_from_iova(cp->mdev, to, iova, len * sizeof(struct ccw1));
+ if (ret)
+ return ret;
+
+ if (!cp->orb.cmd.fmt) {
+ pccw1 = to;
+ for (i = 0; i < len; i++) {
+ ccw0 = *(struct ccw0 *)pccw1;
+ if ((pccw1->cmd_code & 0x0f) == CCW_CMD_TIC) {
+ pccw1->cmd_code = CCW_CMD_TIC;
+ pccw1->flags = 0;
+ pccw1->count = 0;
+ } else {
+ pccw1->cmd_code = ccw0.cmd_code;
+ pccw1->flags = ccw0.flags;
+ pccw1->count = ccw0.count;
+ }
+ pccw1->cda = ccw0.cda;
+ pccw1++;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Helpers to operate ccwchain.
+ */
+#define ccw_is_test(_ccw) (((_ccw)->cmd_code & 0x0F) == 0)
+
+#define ccw_is_noop(_ccw) ((_ccw)->cmd_code == CCW_CMD_NOOP)
+
+#define ccw_is_tic(_ccw) ((_ccw)->cmd_code == CCW_CMD_TIC)
+
+#define ccw_is_idal(_ccw) ((_ccw)->flags & CCW_FLAG_IDA)
+
+
+#define ccw_is_chain(_ccw) ((_ccw)->flags & (CCW_FLAG_CC | CCW_FLAG_DC))
+
+static struct ccwchain *ccwchain_alloc(struct channel_program *cp, int len)
+{
+ struct ccwchain *chain;
+ void *data;
+ size_t size;
+
+ /* Make ccw address aligned to 8. */
+ size = ((sizeof(*chain) + 7L) & -8L) +
+ sizeof(*chain->ch_ccw) * len +
+ sizeof(*chain->ch_pat) * len;
+ chain = kzalloc(size, GFP_DMA | GFP_KERNEL);
+ if (!chain)
+ return NULL;
+
+ data = (u8 *)chain + ((sizeof(*chain) + 7L) & -8L);
+ chain->ch_ccw = (struct ccw1 *)data;
+
+ data = (u8 *)(chain->ch_ccw) + sizeof(*chain->ch_ccw) * len;
+ chain->ch_pat = (struct pfn_array_table *)data;
+
+ chain->ch_len = len;
+
+ list_add_tail(&chain->next, &cp->ccwchain_list);
+
+ return chain;
+}
+
+static void ccwchain_free(struct ccwchain *chain)
+{
+ list_del(&chain->next);
+ kfree(chain);
+}
+
+/* Free resource for a ccw that allocated memory for its cda. */
+static void ccwchain_cda_free(struct ccwchain *chain, int idx)
+{
+ struct ccw1 *ccw = chain->ch_ccw + idx;
+
+ if (!ccw->count)
+ return;
+
+ kfree((void *)(u64)ccw->cda);
+}
+
+/* Unpin the pages then free the memory resources. */
+static void cp_unpin_free(struct channel_program *cp)
+{
+ struct ccwchain *chain, *temp;
+ int i;
+
+ list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) {
+ for (i = 0; i < chain->ch_len; i++) {
+ pfn_array_table_unpin_free(chain->ch_pat + i,
+ cp->mdev);
+ ccwchain_cda_free(chain, i);
+ }
+ ccwchain_free(chain);
+ }
+}
+
+/**
+ * ccwchain_calc_length - calculate the length of the ccw chain.
+ * @iova: guest physical address of the target ccw chain
+ * @cp: channel_program on which to perform the operation
+ *
+ * This is the chain length not considering any TICs.
+ * You need to do a new round for each TIC target.
+ *
+ * Returns: the length of the ccw chain or -errno.
+ */
+static int ccwchain_calc_length(u64 iova, struct channel_program *cp)
+{
+ struct ccw1 *ccw, *p;
+ int cnt;
+
+ /*
+ * Copy current chain from guest to host kernel.
+ * Currently the chain length is limited to CCWCHAIN_LEN_MAX (256).
+ * So copying 2K is enough (safe).
+ */
+ p = ccw = kcalloc(CCWCHAIN_LEN_MAX, sizeof(*ccw), GFP_KERNEL);
+ if (!ccw)
+ return -ENOMEM;
+
+ cnt = copy_ccw_from_iova(cp, ccw, iova, CCWCHAIN_LEN_MAX);
+ if (cnt) {
+ kfree(ccw);
+ return cnt;
+ }
+
+ cnt = 0;
+ do {
+ cnt++;
+
+ if ((!ccw_is_chain(ccw)) && (!ccw_is_tic(ccw)))
+ break;
+
+ ccw++;
+ } while (cnt < CCWCHAIN_LEN_MAX + 1);
+
+ if (cnt == CCWCHAIN_LEN_MAX + 1)
+ cnt = -EINVAL;
+
+ kfree(p);
+ return cnt;
+}
+
+static int tic_target_chain_exists(struct ccw1 *tic, struct channel_program *cp)
+{
+ struct ccwchain *chain;
+ u32 ccw_head, ccw_tail;
+
+ list_for_each_entry(chain, &cp->ccwchain_list, next) {
+ ccw_head = chain->ch_iova;
+ ccw_tail = ccw_head + (chain->ch_len - 1) * sizeof(struct ccw1);
+
+ if ((ccw_head <= tic->cda) && (tic->cda <= ccw_tail))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int ccwchain_loop_tic(struct ccwchain *chain,
+ struct channel_program *cp);
+
+static int ccwchain_handle_tic(struct ccw1 *tic, struct channel_program *cp)
+{
+ struct ccwchain *chain;
+ int len, ret;
+
+ /* May transfer to an existing chain. */
+ if (tic_target_chain_exists(tic, cp))
+ return 0;
+
+ /* Get chain length. */
+ len = ccwchain_calc_length(tic->cda, cp);
+ if (len < 0)
+ return len;
+
+ /* Need alloc a new chain for this one. */
+ chain = ccwchain_alloc(cp, len);
+ if (!chain)
+ return -ENOMEM;
+ chain->ch_iova = tic->cda;
+
+ /* Copy the new chain from user. */
+ ret = copy_ccw_from_iova(cp, chain->ch_ccw, tic->cda, len);
+ if (ret) {
+ ccwchain_free(chain);
+ return ret;
+ }
+
+ /* Loop for tics on this new chain. */
+ return ccwchain_loop_tic(chain, cp);
+}
+
+/* Loop for TICs. */
+static int ccwchain_loop_tic(struct ccwchain *chain, struct channel_program *cp)
+{
+ struct ccw1 *tic;
+ int i, ret;
+
+ for (i = 0; i < chain->ch_len; i++) {
+ tic = chain->ch_ccw + i;
+
+ if (!ccw_is_tic(tic))
+ continue;
+
+ ret = ccwchain_handle_tic(tic, cp);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ccwchain_fetch_tic(struct ccwchain *chain,
+ int idx,
+ struct channel_program *cp)
+{
+ struct ccw1 *ccw = chain->ch_ccw + idx;
+ struct ccwchain *iter;
+ u32 ccw_head, ccw_tail;
+
+ list_for_each_entry(iter, &cp->ccwchain_list, next) {
+ ccw_head = iter->ch_iova;
+ ccw_tail = ccw_head + (iter->ch_len - 1) * sizeof(struct ccw1);
+
+ if ((ccw_head <= ccw->cda) && (ccw->cda <= ccw_tail)) {
+ ccw->cda = (__u32) (addr_t) (iter->ch_ccw +
+ (ccw->cda - ccw_head));
+ return 0;
+ }
+ }
+
+ return -EFAULT;
+}
+
+static int ccwchain_fetch_direct(struct ccwchain *chain,
+ int idx,
+ struct channel_program *cp)
+{
+ struct ccw1 *ccw;
+ struct pfn_array_table *pat;
+ unsigned long *idaws;
+ int idaw_nr;
+
+ ccw = chain->ch_ccw + idx;
+
+ /*
+ * Pin data page(s) in memory.
+ * The number of pages actually is the count of the idaws which will be
+ * needed when translating a direct ccw to a idal ccw.
+ */
+ pat = chain->ch_pat + idx;
+ if (pfn_array_table_init(pat, 1))
+ return -ENOMEM;
+ idaw_nr = pfn_array_alloc_pin(pat->pat_pa, cp->mdev,
+ ccw->cda, ccw->count);
+ if (idaw_nr < 0)
+ return idaw_nr;
+
+ /* Translate this direct ccw to a idal ccw. */
+ idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL);
+ if (!idaws) {
+ pfn_array_table_unpin_free(pat, cp->mdev);
+ return -ENOMEM;
+ }
+ ccw->cda = (__u32) virt_to_phys(idaws);
+ ccw->flags |= CCW_FLAG_IDA;
+
+ pfn_array_table_idal_create_words(pat, idaws);
+
+ return 0;
+}
+
+static int ccwchain_fetch_idal(struct ccwchain *chain,
+ int idx,
+ struct channel_program *cp)
+{
+ struct ccw1 *ccw;
+ struct pfn_array_table *pat;
+ unsigned long *idaws;
+ u64 idaw_iova;
+ unsigned int idaw_nr, idaw_len;
+ int i, ret;
+
+ ccw = chain->ch_ccw + idx;
+
+ /* Calculate size of idaws. */
+ ret = copy_from_iova(cp->mdev, &idaw_iova, ccw->cda, sizeof(idaw_iova));
+ if (ret)
+ return ret;
+ idaw_nr = idal_nr_words((void *)(idaw_iova), ccw->count);
+ idaw_len = idaw_nr * sizeof(*idaws);
+
+ /* Pin data page(s) in memory. */
+ pat = chain->ch_pat + idx;
+ ret = pfn_array_table_init(pat, idaw_nr);
+ if (ret)
+ return ret;
+
+ /* Translate idal ccw to use new allocated idaws. */
+ idaws = kzalloc(idaw_len, GFP_DMA | GFP_KERNEL);
+ if (!idaws) {
+ ret = -ENOMEM;
+ goto out_unpin;
+ }
+
+ ret = copy_from_iova(cp->mdev, idaws, ccw->cda, idaw_len);
+ if (ret)
+ goto out_free_idaws;
+
+ ccw->cda = virt_to_phys(idaws);
+
+ for (i = 0; i < idaw_nr; i++) {
+ idaw_iova = *(idaws + i);
+ if (IS_ERR_VALUE(idaw_iova)) {
+ ret = -EFAULT;
+ goto out_free_idaws;
+ }
+
+ ret = pfn_array_alloc_pin(pat->pat_pa + i, cp->mdev,
+ idaw_iova, 1);
+ if (ret < 0)
+ goto out_free_idaws;
+ }
+
+ pfn_array_table_idal_create_words(pat, idaws);
+
+ return 0;
+
+out_free_idaws:
+ kfree(idaws);
+out_unpin:
+ pfn_array_table_unpin_free(pat, cp->mdev);
+ return ret;
+}
+
+/*
+ * Fetch one ccw.
+ * To reduce memory copy, we'll pin the cda page in memory,
+ * and to get rid of the cda 2G limitiaion of ccw1, we'll translate
+ * direct ccws to idal ccws.
+ */
+static int ccwchain_fetch_one(struct ccwchain *chain,
+ int idx,
+ struct channel_program *cp)
+{
+ struct ccw1 *ccw = chain->ch_ccw + idx;
+
+ if (ccw_is_test(ccw) || ccw_is_noop(ccw))
+ return 0;
+
+ if (ccw_is_tic(ccw))
+ return ccwchain_fetch_tic(chain, idx, cp);
+
+ if (ccw_is_idal(ccw))
+ return ccwchain_fetch_idal(chain, idx, cp);
+
+ return ccwchain_fetch_direct(chain, idx, cp);
+}
+
+/**
+ * cp_init() - allocate ccwchains for a channel program.
+ * @cp: channel_program on which to perform the operation
+ * @mdev: the mediated device to perform pin/unpin operations
+ * @orb: control block for the channel program from the guest
+ *
+ * This creates one or more ccwchain(s), and copies the raw data of
+ * the target channel program from @orb->cmd.iova to the new ccwchain(s).
+ *
+ * Limitations:
+ * 1. Supports only prefetch enabled mode.
+ * 2. Supports idal(c64) ccw chaining.
+ * 3. Supports 4k idaw.
+ *
+ * Returns:
+ * %0 on success and a negative error value on failure.
+ */
+int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb)
+{
+ u64 iova = orb->cmd.cpa;
+ struct ccwchain *chain;
+ int len, ret;
+
+ /*
+ * XXX:
+ * Only support prefetch enable mode now.
+ * Only support 64bit addressing idal.
+ * Only support 4k IDAW.
+ */
+ if (!orb->cmd.pfch || !orb->cmd.c64 || orb->cmd.i2k)
+ return -EOPNOTSUPP;
+
+ INIT_LIST_HEAD(&cp->ccwchain_list);
+ memcpy(&cp->orb, orb, sizeof(*orb));
+ cp->mdev = mdev;
+
+ /* Get chain length. */
+ len = ccwchain_calc_length(iova, cp);
+ if (len < 0)
+ return len;
+
+ /* Alloc mem for the head chain. */
+ chain = ccwchain_alloc(cp, len);
+ if (!chain)
+ return -ENOMEM;
+ chain->ch_iova = iova;
+
+ /* Copy the head chain from guest. */
+ ret = copy_ccw_from_iova(cp, chain->ch_ccw, iova, len);
+ if (ret) {
+ ccwchain_free(chain);
+ return ret;
+ }
+
+ /* Now loop for its TICs. */
+ ret = ccwchain_loop_tic(chain, cp);
+ if (ret)
+ cp_unpin_free(cp);
+
+ return ret;
+}
+
+
+/**
+ * cp_free() - free resources for channel program.
+ * @cp: channel_program on which to perform the operation
+ *
+ * This unpins the memory pages and frees the memory space occupied by
+ * @cp, which must have been returned by a previous call to cp_init().
+ * Otherwise, undefined behavior occurs.
+ */
+void cp_free(struct channel_program *cp)
+{
+ cp_unpin_free(cp);
+}
+
+/**
+ * cp_prefetch() - translate a guest physical address channel program to
+ * a real-device runnable channel program.
+ * @cp: channel_program on which to perform the operation
+ *
+ * This function translates the guest-physical-address channel program
+ * and stores the result to ccwchain list. @cp must have been
+ * initialized by a previous call with cp_init(). Otherwise, undefined
+ * behavior occurs.
+ *
+ * The S/390 CCW Translation APIS (prefixed by 'cp_') are introduced
+ * as helpers to do ccw chain translation inside the kernel. Basically
+ * they accept a channel program issued by a virtual machine, and
+ * translate the channel program to a real-device runnable channel
+ * program.
+ *
+ * These APIs will copy the ccws into kernel-space buffers, and update
+ * the guest phsical addresses with their corresponding host physical
+ * addresses. Then channel I/O device drivers could issue the
+ * translated channel program to real devices to perform an I/O
+ * operation.
+ *
+ * These interfaces are designed to support translation only for
+ * channel programs, which are generated and formatted by a
+ * guest. Thus this will make it possible for things like VFIO to
+ * leverage the interfaces to passthrough a channel I/O mediated
+ * device in QEMU.
+ *
+ * We support direct ccw chaining by translating them to idal ccws.
+ *
+ * Returns:
+ * %0 on success and a negative error value on failure.
+ */
+int cp_prefetch(struct channel_program *cp)
+{
+ struct ccwchain *chain;
+ int len, idx, ret;
+
+ list_for_each_entry(chain, &cp->ccwchain_list, next) {
+ len = chain->ch_len;
+ for (idx = 0; idx < len; idx++) {
+ ret = ccwchain_fetch_one(chain, idx, cp);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * cp_get_orb() - get the orb of the channel program
+ * @cp: channel_program on which to perform the operation
+ * @intparm: new intparm for the returned orb
+ * @lpm: candidate value of the logical-path mask for the returned orb
+ *
+ * This function returns the address of the updated orb of the channel
+ * program. Channel I/O device drivers could use this orb to issue a
+ * ssch.
+ */
+union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm)
+{
+ union orb *orb;
+ struct ccwchain *chain;
+ struct ccw1 *cpa;
+
+ orb = &cp->orb;
+
+ orb->cmd.intparm = intparm;
+ orb->cmd.fmt = 1;
+ orb->cmd.key = PAGE_DEFAULT_KEY >> 4;
+
+ if (orb->cmd.lpm == 0)
+ orb->cmd.lpm = lpm;
+
+ chain = list_first_entry(&cp->ccwchain_list, struct ccwchain, next);
+ cpa = chain->ch_ccw;
+ orb->cmd.cpa = (__u32) __pa(cpa);
+
+ return orb;
+}
+
+/**
+ * cp_update_scsw() - update scsw for a channel program.
+ * @cp: channel_program on which to perform the operation
+ * @scsw: I/O results of the channel program and also the target to be
+ * updated
+ *
+ * @scsw contains the I/O results of the channel program that pointed
+ * to by @cp. However what @scsw->cpa stores is a host physical
+ * address, which is meaningless for the guest, which is waiting for
+ * the I/O results.
+ *
+ * This function updates @scsw->cpa to its coressponding guest physical
+ * address.
+ */
+void cp_update_scsw(struct channel_program *cp, union scsw *scsw)
+{
+ struct ccwchain *chain;
+ u32 cpa = scsw->cmd.cpa;
+ u32 ccw_head, ccw_tail;
+
+ /*
+ * LATER:
+ * For now, only update the cmd.cpa part. We may need to deal with
+ * other portions of the schib as well, even if we don't return them
+ * in the ioctl directly. Path status changes etc.
+ */
+ list_for_each_entry(chain, &cp->ccwchain_list, next) {
+ ccw_head = (u32)(u64)chain->ch_ccw;
+ ccw_tail = (u32)(u64)(chain->ch_ccw + chain->ch_len - 1);
+
+ if ((ccw_head <= cpa) && (cpa <= ccw_tail)) {
+ /*
+ * (cpa - ccw_head) is the offset value of the host
+ * physical ccw to its chain head.
+ * Adding this value to the guest physical ccw chain
+ * head gets us the guest cpa.
+ */
+ cpa = chain->ch_iova + (cpa - ccw_head);
+ break;
+ }
+ }
+
+ scsw->cmd.cpa = cpa;
+}
+
+/**
+ * cp_iova_pinned() - check if an iova is pinned for a ccw chain.
+ * @cmd: ccwchain command on which to perform the operation
+ * @iova: the iova to check
+ *
+ * If the @iova is currently pinned for the ccw chain, return true;
+ * else return false.
+ */
+bool cp_iova_pinned(struct channel_program *cp, u64 iova)
+{
+ struct ccwchain *chain;
+ int i;
+
+ list_for_each_entry(chain, &cp->ccwchain_list, next) {
+ for (i = 0; i < chain->ch_len; i++)
+ if (pfn_array_table_iova_pinned(chain->ch_pat + i,
+ iova))
+ return true;
+ }
+
+ return false;
+}
diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h
new file mode 100644
index 000000000000..7a1996b3b36d
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_cp.h
@@ -0,0 +1,42 @@
+/*
+ * channel program interfaces
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#ifndef _VFIO_CCW_CP_H_
+#define _VFIO_CCW_CP_H_
+
+#include <asm/cio.h>
+#include <asm/scsw.h>
+
+#include "orb.h"
+
+/**
+ * struct channel_program - manage information for channel program
+ * @ccwchain_list: list head of ccwchains
+ * @orb: orb for the currently processed ssch request
+ * @mdev: the mediated device to perform page pinning/unpinning
+ *
+ * @ccwchain_list is the head of a ccwchain list, that contents the
+ * translated result of the guest channel program that pointed out by
+ * the iova parameter when calling cp_init.
+ */
+struct channel_program {
+ struct list_head ccwchain_list;
+ union orb orb;
+ struct device *mdev;
+};
+
+extern int cp_init(struct channel_program *cp, struct device *mdev,
+ union orb *orb);
+extern void cp_free(struct channel_program *cp);
+extern int cp_prefetch(struct channel_program *cp);
+extern union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm);
+extern void cp_update_scsw(struct channel_program *cp, union scsw *scsw);
+extern bool cp_iova_pinned(struct channel_program *cp, u64 iova);
+
+#endif
diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
new file mode 100644
index 000000000000..e90dd43d2a55
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -0,0 +1,308 @@
+/*
+ * VFIO based Physical Subchannel device driver
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/mdev.h>
+
+#include <asm/isc.h>
+
+#include "ioasm.h"
+#include "css.h"
+#include "vfio_ccw_private.h"
+
+struct workqueue_struct *vfio_ccw_work_q;
+
+/*
+ * Helpers
+ */
+int vfio_ccw_sch_quiesce(struct subchannel *sch)
+{
+ struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ DECLARE_COMPLETION_ONSTACK(completion);
+ int iretry, ret = 0;
+
+ spin_lock_irq(sch->lock);
+ if (!sch->schib.pmcw.ena)
+ goto out_unlock;
+ ret = cio_disable_subchannel(sch);
+ if (ret != -EBUSY)
+ goto out_unlock;
+
+ do {
+ iretry = 255;
+
+ ret = cio_cancel_halt_clear(sch, &iretry);
+ while (ret == -EBUSY) {
+ /*
+ * Flush all I/O and wait for
+ * cancel/halt/clear completion.
+ */
+ private->completion = &completion;
+ spin_unlock_irq(sch->lock);
+
+ wait_for_completion_timeout(&completion, 3*HZ);
+
+ spin_lock_irq(sch->lock);
+ private->completion = NULL;
+ flush_workqueue(vfio_ccw_work_q);
+ ret = cio_cancel_halt_clear(sch, &iretry);
+ };
+
+ ret = cio_disable_subchannel(sch);
+ } while (ret == -EBUSY);
+out_unlock:
+ private->state = VFIO_CCW_STATE_NOT_OPER;
+ spin_unlock_irq(sch->lock);
+ return ret;
+}
+
+static void vfio_ccw_sch_io_todo(struct work_struct *work)
+{
+ struct vfio_ccw_private *private;
+ struct subchannel *sch;
+ struct irb *irb;
+
+ private = container_of(work, struct vfio_ccw_private, io_work);
+ irb = &private->irb;
+ sch = private->sch;
+
+ if (scsw_is_solicited(&irb->scsw)) {
+ cp_update_scsw(&private->cp, &irb->scsw);
+ cp_free(&private->cp);
+ }
+ memcpy(private->io_region.irb_area, irb, sizeof(*irb));
+
+ if (private->io_trigger)
+ eventfd_signal(private->io_trigger, 1);
+
+ if (private->mdev)
+ private->state = VFIO_CCW_STATE_IDLE;
+}
+
+/*
+ * Sysfs interfaces
+ */
+static ssize_t chpids_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct subchannel *sch = to_subchannel(dev);
+ struct chsc_ssd_info *ssd = &sch->ssd_info;
+ ssize_t ret = 0;
+ int chp;
+ int mask;
+
+ for (chp = 0; chp < 8; chp++) {
+ mask = 0x80 >> chp;
+ if (ssd->path_mask & mask)
+ ret += sprintf(buf + ret, "%02x ", ssd->chpid[chp].id);
+ else
+ ret += sprintf(buf + ret, "00 ");
+ }
+ ret += sprintf(buf+ret, "\n");
+ return ret;
+}
+
+static ssize_t pimpampom_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct subchannel *sch = to_subchannel(dev);
+ struct pmcw *pmcw = &sch->schib.pmcw;
+
+ return sprintf(buf, "%02x %02x %02x\n",
+ pmcw->pim, pmcw->pam, pmcw->pom);
+}
+
+static DEVICE_ATTR(chpids, 0444, chpids_show, NULL);
+static DEVICE_ATTR(pimpampom, 0444, pimpampom_show, NULL);
+
+static struct attribute *vfio_subchannel_attrs[] = {
+ &dev_attr_chpids.attr,
+ &dev_attr_pimpampom.attr,
+ NULL,
+};
+
+static struct attribute_group vfio_subchannel_attr_group = {
+ .attrs = vfio_subchannel_attrs,
+};
+
+/*
+ * Css driver callbacks
+ */
+static void vfio_ccw_sch_irq(struct subchannel *sch)
+{
+ struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+
+ inc_irq_stat(IRQIO_CIO);
+ vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT);
+}
+
+static int vfio_ccw_sch_probe(struct subchannel *sch)
+{
+ struct pmcw *pmcw = &sch->schib.pmcw;
+ struct vfio_ccw_private *private;
+ int ret;
+
+ if (pmcw->qf) {
+ dev_warn(&sch->dev, "vfio: ccw: does not support QDIO: %s\n",
+ dev_name(&sch->dev));
+ return -ENODEV;
+ }
+
+ private = kzalloc(sizeof(*private), GFP_KERNEL | GFP_DMA);
+ if (!private)
+ return -ENOMEM;
+ private->sch = sch;
+ dev_set_drvdata(&sch->dev, private);
+
+ spin_lock_irq(sch->lock);
+ private->state = VFIO_CCW_STATE_NOT_OPER;
+ sch->isc = VFIO_CCW_ISC;
+ ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+ spin_unlock_irq(sch->lock);
+ if (ret)
+ goto out_free;
+
+ ret = sysfs_create_group(&sch->dev.kobj, &vfio_subchannel_attr_group);
+ if (ret)
+ goto out_disable;
+
+ ret = vfio_ccw_mdev_reg(sch);
+ if (ret)
+ goto out_rm_group;
+
+ INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo);
+ atomic_set(&private->avail, 1);
+ private->state = VFIO_CCW_STATE_STANDBY;
+
+ return 0;
+
+out_rm_group:
+ sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group);
+out_disable:
+ cio_disable_subchannel(sch);
+out_free:
+ dev_set_drvdata(&sch->dev, NULL);
+ kfree(private);
+ return ret;
+}
+
+static int vfio_ccw_sch_remove(struct subchannel *sch)
+{
+ struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+
+ vfio_ccw_sch_quiesce(sch);
+
+ vfio_ccw_mdev_unreg(sch);
+
+ sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group);
+
+ dev_set_drvdata(&sch->dev, NULL);
+
+ kfree(private);
+
+ return 0;
+}
+
+static void vfio_ccw_sch_shutdown(struct subchannel *sch)
+{
+ vfio_ccw_sch_quiesce(sch);
+}
+
+/**
+ * vfio_ccw_sch_event - process subchannel event
+ * @sch: subchannel
+ * @process: non-zero if function is called in process context
+ *
+ * An unspecified event occurred for this subchannel. Adjust data according
+ * to the current operational state of the subchannel. Return zero when the
+ * event has been handled sufficiently or -EAGAIN when this function should
+ * be called again in process context.
+ */
+static int vfio_ccw_sch_event(struct subchannel *sch, int process)
+{
+ struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ unsigned long flags;
+
+ spin_lock_irqsave(sch->lock, flags);
+ if (!device_is_registered(&sch->dev))
+ goto out_unlock;
+
+ if (work_pending(&sch->todo_work))
+ goto out_unlock;
+
+ if (cio_update_schib(sch)) {
+ vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER);
+ goto out_unlock;
+ }
+
+ private = dev_get_drvdata(&sch->dev);
+ if (private->state == VFIO_CCW_STATE_NOT_OPER) {
+ private->state = private->mdev ? VFIO_CCW_STATE_IDLE :
+ VFIO_CCW_STATE_STANDBY;
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(sch->lock, flags);
+
+ return 0;
+}
+
+static struct css_device_id vfio_ccw_sch_ids[] = {
+ { .match_flags = 0x1, .type = SUBCHANNEL_TYPE_IO, },
+ { /* end of list */ },
+};
+MODULE_DEVICE_TABLE(css, vfio_ccw_sch_ids);
+
+static struct css_driver vfio_ccw_sch_driver = {
+ .drv = {
+ .name = "vfio_ccw",
+ .owner = THIS_MODULE,
+ },
+ .subchannel_type = vfio_ccw_sch_ids,
+ .irq = vfio_ccw_sch_irq,
+ .probe = vfio_ccw_sch_probe,
+ .remove = vfio_ccw_sch_remove,
+ .shutdown = vfio_ccw_sch_shutdown,
+ .sch_event = vfio_ccw_sch_event,
+};
+
+static int __init vfio_ccw_sch_init(void)
+{
+ int ret;
+
+ vfio_ccw_work_q = create_singlethread_workqueue("vfio-ccw");
+ if (!vfio_ccw_work_q)
+ return -ENOMEM;
+
+ isc_register(VFIO_CCW_ISC);
+ ret = css_driver_register(&vfio_ccw_sch_driver);
+ if (ret) {
+ isc_unregister(VFIO_CCW_ISC);
+ destroy_workqueue(vfio_ccw_work_q);
+ }
+
+ return ret;
+}
+
+static void __exit vfio_ccw_sch_exit(void)
+{
+ css_driver_unregister(&vfio_ccw_sch_driver);
+ isc_unregister(VFIO_CCW_ISC);
+ destroy_workqueue(vfio_ccw_work_q);
+}
+module_init(vfio_ccw_sch_init);
+module_exit(vfio_ccw_sch_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
new file mode 100644
index 000000000000..55b6cc55758e
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -0,0 +1,207 @@
+/*
+ * Finite state machine for vfio-ccw device handling
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ */
+
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+
+#include "ioasm.h"
+#include "vfio_ccw_private.h"
+
+static int fsm_io_helper(struct vfio_ccw_private *private)
+{
+ struct subchannel *sch;
+ union orb *orb;
+ int ccode;
+ __u8 lpm;
+ unsigned long flags;
+
+ sch = private->sch;
+
+ spin_lock_irqsave(sch->lock, flags);
+ private->state = VFIO_CCW_STATE_BUSY;
+ spin_unlock_irqrestore(sch->lock, flags);
+
+ orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm);
+
+ /* Issue "Start Subchannel" */
+ ccode = ssch(sch->schid, orb);
+
+ switch (ccode) {
+ case 0:
+ /*
+ * Initialize device status information
+ */
+ sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND;
+ return 0;
+ case 1: /* Status pending */
+ case 2: /* Busy */
+ return -EBUSY;
+ case 3: /* Device/path not operational */
+ {
+ lpm = orb->cmd.lpm;
+ if (lpm != 0)
+ sch->lpm &= ~lpm;
+ else
+ sch->lpm = 0;
+
+ if (cio_update_schib(sch))
+ return -ENODEV;
+
+ return sch->lpm ? -EACCES : -ENODEV;
+ }
+ default:
+ return ccode;
+ }
+}
+
+static void fsm_notoper(struct vfio_ccw_private *private,
+ enum vfio_ccw_event event)
+{
+ struct subchannel *sch = private->sch;
+
+ /*
+ * TODO:
+ * Probably we should send the machine check to the guest.
+ */
+ css_sched_sch_todo(sch, SCH_TODO_UNREG);
+ private->state = VFIO_CCW_STATE_NOT_OPER;
+}
+
+/*
+ * No operation action.
+ */
+static void fsm_nop(struct vfio_ccw_private *private,
+ enum vfio_ccw_event event)
+{
+}
+
+static void fsm_io_error(struct vfio_ccw_private *private,
+ enum vfio_ccw_event event)
+{
+ pr_err("vfio-ccw: FSM: I/O request from state:%d\n", private->state);
+ private->io_region.ret_code = -EIO;
+}
+
+static void fsm_io_busy(struct vfio_ccw_private *private,
+ enum vfio_ccw_event event)
+{
+ private->io_region.ret_code = -EBUSY;
+}
+
+static void fsm_disabled_irq(struct vfio_ccw_private *private,
+ enum vfio_ccw_event event)
+{
+ struct subchannel *sch = private->sch;
+
+ /*
+ * An interrupt in a disabled state means a previous disable was not
+ * successful - should not happen, but we try to disable again.
+ */
+ cio_disable_subchannel(sch);
+}
+
+/*
+ * Deal with the ccw command request from the userspace.
+ */
+static void fsm_io_request(struct vfio_ccw_private *private,
+ enum vfio_ccw_event event)
+{
+ union orb *orb;
+ union scsw *scsw = &private->scsw;
+ struct ccw_io_region *io_region = &private->io_region;
+ struct mdev_device *mdev = private->mdev;
+
+ private->state = VFIO_CCW_STATE_BOXED;
+
+ memcpy(scsw, io_region->scsw_area, sizeof(*scsw));
+
+ if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) {
+ orb = (union orb *)io_region->orb_area;
+
+ io_region->ret_code = cp_init(&private->cp, mdev_dev(mdev),
+ orb);
+ if (io_region->ret_code)
+ goto err_out;
+
+ io_region->ret_code = cp_prefetch(&private->cp);
+ if (io_region->ret_code) {
+ cp_free(&private->cp);
+ goto err_out;
+ }
+
+ /* Start channel program and wait for I/O interrupt. */
+ io_region->ret_code = fsm_io_helper(private);
+ if (io_region->ret_code) {
+ cp_free(&private->cp);
+ goto err_out;
+ }
+ return;
+ } else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
+ /* XXX: Handle halt. */
+ io_region->ret_code = -EOPNOTSUPP;
+ goto err_out;
+ } else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
+ /* XXX: Handle clear. */
+ io_region->ret_code = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+err_out:
+ private->state = VFIO_CCW_STATE_IDLE;
+}
+
+/*
+ * Got an interrupt for a normal io (state busy).
+ */
+static void fsm_irq(struct vfio_ccw_private *private,
+ enum vfio_ccw_event event)
+{
+ struct irb *irb;
+
+ if (!private)
+ return;
+
+ irb = this_cpu_ptr(&cio_irb);
+ memcpy(&private->irb, irb, sizeof(*irb));
+
+ queue_work(vfio_ccw_work_q, &private->io_work);
+
+ if (private->completion)
+ complete(private->completion);
+}
+
+/*
+ * Device statemachine
+ */
+fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS] = {
+ [VFIO_CCW_STATE_NOT_OPER] = {
+ [VFIO_CCW_EVENT_NOT_OPER] = fsm_nop,
+ [VFIO_CCW_EVENT_IO_REQ] = fsm_io_error,
+ [VFIO_CCW_EVENT_INTERRUPT] = fsm_disabled_irq,
+ },
+ [VFIO_CCW_STATE_STANDBY] = {
+ [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper,
+ [VFIO_CCW_EVENT_IO_REQ] = fsm_io_error,
+ [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq,
+ },
+ [VFIO_CCW_STATE_IDLE] = {
+ [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper,
+ [VFIO_CCW_EVENT_IO_REQ] = fsm_io_request,
+ [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq,
+ },
+ [VFIO_CCW_STATE_BOXED] = {
+ [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper,
+ [VFIO_CCW_EVENT_IO_REQ] = fsm_io_busy,
+ [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq,
+ },
+ [VFIO_CCW_STATE_BUSY] = {
+ [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper,
+ [VFIO_CCW_EVENT_IO_REQ] = fsm_io_busy,
+ [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq,
+ },
+};
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
new file mode 100644
index 000000000000..b2e615404034
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -0,0 +1,447 @@
+/*
+ * Physical device callbacks for vfio_ccw
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+
+#include "vfio_ccw_private.h"
+
+static int vfio_ccw_mdev_reset(struct mdev_device *mdev)
+{
+ struct vfio_ccw_private *private;
+ struct subchannel *sch;
+ int ret;
+
+ private = dev_get_drvdata(mdev_parent_dev(mdev));
+ if (!private)
+ return -ENODEV;
+
+ sch = private->sch;
+ /*
+ * TODO:
+ * In the cureent stage, some things like "no I/O running" and "no
+ * interrupt pending" are clear, but we are not sure what other state
+ * we need to care about.
+ * There are still a lot more instructions need to be handled. We
+ * should come back here later.
+ */
+ ret = vfio_ccw_sch_quiesce(sch);
+ if (ret)
+ return ret;
+
+ ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch);
+ if (!ret)
+ private->state = VFIO_CCW_STATE_IDLE;
+
+ return ret;
+}
+
+static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ struct vfio_ccw_private *private =
+ container_of(nb, struct vfio_ccw_private, nb);
+
+ if (!private)
+ return NOTIFY_STOP;
+
+ /*
+ * Vendor drivers MUST unpin pages in response to an
+ * invalidation.
+ */
+ if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
+ struct vfio_iommu_type1_dma_unmap *unmap = data;
+
+ if (!cp_iova_pinned(&private->cp, unmap->iova))
+ return NOTIFY_OK;
+
+ if (vfio_ccw_mdev_reset(private->mdev))
+ return NOTIFY_BAD;
+
+ cp_free(&private->cp);
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+ return sprintf(buf, "I/O subchannel (Non-QDIO)\n");
+}
+MDEV_TYPE_ATTR_RO(name);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", VFIO_DEVICE_API_CCW_STRING);
+}
+MDEV_TYPE_ATTR_RO(device_api);
+
+static ssize_t available_instances_show(struct kobject *kobj,
+ struct device *dev, char *buf)
+{
+ struct vfio_ccw_private *private = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%d\n", atomic_read(&private->avail));
+}
+MDEV_TYPE_ATTR_RO(available_instances);
+
+static struct attribute *mdev_types_attrs[] = {
+ &mdev_type_attr_name.attr,
+ &mdev_type_attr_device_api.attr,
+ &mdev_type_attr_available_instances.attr,
+ NULL,
+};
+
+static struct attribute_group mdev_type_group = {
+ .name = "io",
+ .attrs = mdev_types_attrs,
+};
+
+struct attribute_group *mdev_type_groups[] = {
+ &mdev_type_group,
+ NULL,
+};
+
+static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+ struct vfio_ccw_private *private =
+ dev_get_drvdata(mdev_parent_dev(mdev));
+
+ if (private->state == VFIO_CCW_STATE_NOT_OPER)
+ return -ENODEV;
+
+ if (atomic_dec_if_positive(&private->avail) < 0)
+ return -EPERM;
+
+ private->mdev = mdev;
+ private->state = VFIO_CCW_STATE_IDLE;
+
+ return 0;
+}
+
+static int vfio_ccw_mdev_remove(struct mdev_device *mdev)
+{
+ struct vfio_ccw_private *private =
+ dev_get_drvdata(mdev_parent_dev(mdev));
+ int ret;
+
+ if (!private)
+ goto out;
+
+ if ((private->state == VFIO_CCW_STATE_NOT_OPER) ||
+ (private->state == VFIO_CCW_STATE_STANDBY))
+ goto out;
+
+ ret = vfio_ccw_mdev_reset(mdev);
+ if (ret)
+ return ret;
+
+ private->state = VFIO_CCW_STATE_STANDBY;
+
+out:
+ private->mdev = NULL;
+ atomic_inc(&private->avail);
+
+ return 0;
+}
+
+static int vfio_ccw_mdev_open(struct mdev_device *mdev)
+{
+ struct vfio_ccw_private *private =
+ dev_get_drvdata(mdev_parent_dev(mdev));
+ unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+
+ private->nb.notifier_call = vfio_ccw_mdev_notifier;
+
+ return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
+ &events, &private->nb);
+}
+
+void vfio_ccw_mdev_release(struct mdev_device *mdev)
+{
+ struct vfio_ccw_private *private =
+ dev_get_drvdata(mdev_parent_dev(mdev));
+
+ vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
+ &private->nb);
+}
+
+static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev,
+ char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct vfio_ccw_private *private;
+ struct ccw_io_region *region;
+
+ if (*ppos + count > sizeof(*region))
+ return -EINVAL;
+
+ private = dev_get_drvdata(mdev_parent_dev(mdev));
+ if (!private)
+ return -ENODEV;
+
+ region = &private->io_region;
+ if (copy_to_user(buf, (void *)region + *ppos, count))
+ return -EFAULT;
+
+ return count;
+}
+
+static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
+ const char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct vfio_ccw_private *private;
+ struct ccw_io_region *region;
+
+ if (*ppos + count > sizeof(*region))
+ return -EINVAL;
+
+ private = dev_get_drvdata(mdev_parent_dev(mdev));
+ if (!private)
+ return -ENODEV;
+ if (private->state != VFIO_CCW_STATE_IDLE)
+ return -EACCES;
+
+ region = &private->io_region;
+ if (copy_from_user((void *)region + *ppos, buf, count))
+ return -EFAULT;
+
+ vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ);
+ if (region->ret_code != 0) {
+ private->state = VFIO_CCW_STATE_IDLE;
+ return region->ret_code;
+ }
+
+ return count;
+}
+
+static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info)
+{
+ info->flags = VFIO_DEVICE_FLAGS_CCW | VFIO_DEVICE_FLAGS_RESET;
+ info->num_regions = VFIO_CCW_NUM_REGIONS;
+ info->num_irqs = VFIO_CCW_NUM_IRQS;
+
+ return 0;
+}
+
+static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info,
+ u16 *cap_type_id,
+ void **cap_type)
+{
+ switch (info->index) {
+ case VFIO_CCW_CONFIG_REGION_INDEX:
+ info->offset = 0;
+ info->size = sizeof(struct ccw_io_region);
+ info->flags = VFIO_REGION_INFO_FLAG_READ
+ | VFIO_REGION_INFO_FLAG_WRITE;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info)
+{
+ if (info->index != VFIO_CCW_IO_IRQ_INDEX)
+ return -EINVAL;
+
+ info->count = 1;
+ info->flags = VFIO_IRQ_INFO_EVENTFD;
+
+ return 0;
+}
+
+static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev,
+ uint32_t flags,
+ void __user *data)
+{
+ struct vfio_ccw_private *private;
+ struct eventfd_ctx **ctx;
+
+ if (!(flags & VFIO_IRQ_SET_ACTION_TRIGGER))
+ return -EINVAL;
+
+ private = dev_get_drvdata(mdev_parent_dev(mdev));
+ if (!private)
+ return -ENODEV;
+
+ ctx = &private->io_trigger;
+
+ switch (flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
+ case VFIO_IRQ_SET_DATA_NONE:
+ {
+ if (*ctx)
+ eventfd_signal(*ctx, 1);
+ return 0;
+ }
+ case VFIO_IRQ_SET_DATA_BOOL:
+ {
+ uint8_t trigger;
+
+ if (get_user(trigger, (uint8_t __user *)data))
+ return -EFAULT;
+
+ if (trigger && *ctx)
+ eventfd_signal(*ctx, 1);
+ return 0;
+ }
+ case VFIO_IRQ_SET_DATA_EVENTFD:
+ {
+ int32_t fd;
+
+ if (get_user(fd, (int32_t __user *)data))
+ return -EFAULT;
+
+ if (fd == -1) {
+ if (*ctx)
+ eventfd_ctx_put(*ctx);
+ *ctx = NULL;
+ } else if (fd >= 0) {
+ struct eventfd_ctx *efdctx;
+
+ efdctx = eventfd_ctx_fdget(fd);
+ if (IS_ERR(efdctx))
+ return PTR_ERR(efdctx);
+
+ if (*ctx)
+ eventfd_ctx_put(*ctx);
+
+ *ctx = efdctx;
+ } else
+ return -EINVAL;
+
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = 0;
+ unsigned long minsz;
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = vfio_ccw_mdev_get_device_info(&info);
+ if (ret)
+ return ret;
+
+ return copy_to_user((void __user *)arg, &info, minsz);
+ }
+ case VFIO_DEVICE_GET_REGION_INFO:
+ {
+ struct vfio_region_info info;
+ u16 cap_type_id = 0;
+ void *cap_type = NULL;
+
+ minsz = offsetofend(struct vfio_region_info, offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = vfio_ccw_mdev_get_region_info(&info, &cap_type_id,
+ &cap_type);
+ if (ret)
+ return ret;
+
+ return copy_to_user((void __user *)arg, &info, minsz);
+ }
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz || info.index >= VFIO_CCW_NUM_IRQS)
+ return -EINVAL;
+
+ ret = vfio_ccw_mdev_get_irq_info(&info);
+ if (ret)
+ return ret;
+
+ if (info.count == -1)
+ return -EINVAL;
+
+ return copy_to_user((void __user *)arg, &info, minsz);
+ }
+ case VFIO_DEVICE_SET_IRQS:
+ {
+ struct vfio_irq_set hdr;
+ size_t data_size;
+ void __user *data;
+
+ minsz = offsetofend(struct vfio_irq_set, count);
+
+ if (copy_from_user(&hdr, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ ret = vfio_set_irqs_validate_and_prepare(&hdr, 1,
+ VFIO_CCW_NUM_IRQS,
+ &data_size);
+ if (ret)
+ return ret;
+
+ data = (void __user *)(arg + minsz);
+ return vfio_ccw_mdev_set_irqs(mdev, hdr.flags, data);
+ }
+ case VFIO_DEVICE_RESET:
+ return vfio_ccw_mdev_reset(mdev);
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
+ .owner = THIS_MODULE,
+ .supported_type_groups = mdev_type_groups,
+ .create = vfio_ccw_mdev_create,
+ .remove = vfio_ccw_mdev_remove,
+ .open = vfio_ccw_mdev_open,
+ .release = vfio_ccw_mdev_release,
+ .read = vfio_ccw_mdev_read,
+ .write = vfio_ccw_mdev_write,
+ .ioctl = vfio_ccw_mdev_ioctl,
+};
+
+int vfio_ccw_mdev_reg(struct subchannel *sch)
+{
+ return mdev_register_device(&sch->dev, &vfio_ccw_mdev_ops);
+}
+
+void vfio_ccw_mdev_unreg(struct subchannel *sch)
+{
+ mdev_unregister_device(&sch->dev);
+}
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
new file mode 100644
index 000000000000..fc0f01c16ef9
--- /dev/null
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -0,0 +1,96 @@
+/*
+ * Private stuff for vfio_ccw driver
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com>
+ */
+
+#ifndef _VFIO_CCW_PRIVATE_H_
+#define _VFIO_CCW_PRIVATE_H_
+
+#include <linux/completion.h>
+#include <linux/eventfd.h>
+#include <linux/workqueue.h>
+#include <linux/vfio_ccw.h>
+
+#include "css.h"
+#include "vfio_ccw_cp.h"
+
+/**
+ * struct vfio_ccw_private
+ * @sch: pointer to the subchannel
+ * @state: internal state of the device
+ * @completion: synchronization helper of the I/O completion
+ * @avail: available for creating a mediated device
+ * @mdev: pointer to the mediated device
+ * @nb: notifier for vfio events
+ * @io_region: MMIO region to input/output I/O arguments/results
+ * @cp: channel program for the current I/O operation
+ * @irb: irb info received from interrupt
+ * @scsw: scsw info
+ * @io_trigger: eventfd ctx for signaling userspace I/O results
+ * @io_work: work for deferral process of I/O handling
+ */
+struct vfio_ccw_private {
+ struct subchannel *sch;
+ int state;
+ struct completion *completion;
+ atomic_t avail;
+ struct mdev_device *mdev;
+ struct notifier_block nb;
+ struct ccw_io_region io_region;
+
+ struct channel_program cp;
+ struct irb irb;
+ union scsw scsw;
+
+ struct eventfd_ctx *io_trigger;
+ struct work_struct io_work;
+} __aligned(8);
+
+extern int vfio_ccw_mdev_reg(struct subchannel *sch);
+extern void vfio_ccw_mdev_unreg(struct subchannel *sch);
+
+extern int vfio_ccw_sch_quiesce(struct subchannel *sch);
+
+/*
+ * States of the device statemachine.
+ */
+enum vfio_ccw_state {
+ VFIO_CCW_STATE_NOT_OPER,
+ VFIO_CCW_STATE_STANDBY,
+ VFIO_CCW_STATE_IDLE,
+ VFIO_CCW_STATE_BOXED,
+ VFIO_CCW_STATE_BUSY,
+ /* last element! */
+ NR_VFIO_CCW_STATES
+};
+
+/*
+ * Asynchronous events of the device statemachine.
+ */
+enum vfio_ccw_event {
+ VFIO_CCW_EVENT_NOT_OPER,
+ VFIO_CCW_EVENT_IO_REQ,
+ VFIO_CCW_EVENT_INTERRUPT,
+ /* last element! */
+ NR_VFIO_CCW_EVENTS
+};
+
+/*
+ * Action called through jumptable.
+ */
+typedef void (fsm_func_t)(struct vfio_ccw_private *, enum vfio_ccw_event);
+extern fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS];
+
+static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private,
+ int event)
+{
+ vfio_ccw_jumptable[private->state][event](private, event);
+}
+
+extern struct workqueue_struct *vfio_ccw_work_q;
+
+#endif
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 519eff362c1c..ae461050661a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -198,6 +198,7 @@ struct vfio_device_info {
#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
#define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */
__u32 num_regions; /* Max region index + 1 */
__u32 num_irqs; /* Max IRQ index + 1 */
};
@@ -212,6 +213,7 @@ struct vfio_device_info {
#define VFIO_DEVICE_API_PCI_STRING "vfio-pci"
#define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform"
#define VFIO_DEVICE_API_AMBA_STRING "vfio-amba"
+#define VFIO_DEVICE_API_CCW_STRING "vfio-ccw"
/**
* VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
@@ -446,6 +448,22 @@ enum {
VFIO_PCI_NUM_IRQS
};
+/*
+ * The vfio-ccw bus driver makes use of the following fixed region and
+ * IRQ index mapping. Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+ VFIO_CCW_CONFIG_REGION_INDEX,
+ VFIO_CCW_NUM_REGIONS
+};
+
+enum {
+ VFIO_CCW_IO_IRQ_INDEX,
+ VFIO_CCW_NUM_IRQS
+};
+
/**
* VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
* struct vfio_pci_hot_reset_info)
diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h
new file mode 100644
index 000000000000..34a7f6f9e065
--- /dev/null
+++ b/include/uapi/linux/vfio_ccw.h
@@ -0,0 +1,24 @@
+/*
+ * Interfaces for vfio-ccw
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+ */
+
+#ifndef _VFIO_CCW_H_
+#define _VFIO_CCW_H_
+
+#include <linux/types.h>
+
+struct ccw_io_region {
+#define ORB_AREA_SIZE 12
+ __u8 orb_area[ORB_AREA_SIZE];
+#define SCSW_AREA_SIZE 12
+ __u8 scsw_area[SCSW_AREA_SIZE];
+#define IRB_AREA_SIZE 96
+ __u8 irb_area[IRB_AREA_SIZE];
+ __u32 ret_code;
+} __packed;
+
+#endif