diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2018-10-04 18:12:45 +0300 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2018-10-04 18:12:45 +0300 |
commit | dd5bd0a65ff6f22a32b35ca3fa1bcf7a6bc7104f (patch) | |
tree | 7214fc9fb55a1cedee4d0bca3a4f6799414236e7 | |
parent | 7e7126846c95a34f98a1524d5c473af1f0783735 (diff) | |
parent | 55d09dd4c86060fbbc74ab2b1bfaed401cd0163a (diff) | |
download | linux-dd5bd0a65ff6f22a32b35ca3fa1bcf7a6bc7104f.tar.xz |
Merge tag 'kvm-s390-next-4.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD
KVM: s390: Features for 4.20
- Initial version of AP crypto virtualization via vfio-mdev
- Set the host program identifier
- Optimize page table locking
-rw-r--r-- | Documentation/s390/vfio-ap.txt | 837 | ||||
-rw-r--r-- | MAINTAINERS | 12 | ||||
-rw-r--r-- | arch/s390/Kconfig | 11 | ||||
-rw-r--r-- | arch/s390/include/asm/kvm_host.h | 13 | ||||
-rw-r--r-- | arch/s390/include/uapi/asm/kvm.h | 2 | ||||
-rw-r--r-- | arch/s390/kvm/kvm-s390.c | 135 | ||||
-rw-r--r-- | arch/s390/kvm/kvm-s390.h | 1 | ||||
-rw-r--r-- | arch/s390/kvm/vsie.c | 210 | ||||
-rw-r--r-- | arch/s390/mm/gmap.c | 10 | ||||
-rw-r--r-- | arch/s390/tools/gen_facilities.c | 2 | ||||
-rw-r--r-- | drivers/iommu/Kconfig | 8 | ||||
-rw-r--r-- | drivers/s390/crypto/Makefile | 4 | ||||
-rw-r--r-- | drivers/s390/crypto/vfio_ap_drv.c | 157 | ||||
-rw-r--r-- | drivers/s390/crypto/vfio_ap_ops.c | 968 | ||||
-rw-r--r-- | drivers/s390/crypto/vfio_ap_private.h | 88 | ||||
-rw-r--r-- | include/uapi/linux/vfio.h | 2 |
16 files changed, 2397 insertions, 63 deletions
diff --git a/Documentation/s390/vfio-ap.txt b/Documentation/s390/vfio-ap.txt new file mode 100644 index 000000000000..65167cfe4485 --- /dev/null +++ b/Documentation/s390/vfio-ap.txt @@ -0,0 +1,837 @@ +Introduction: +============ +The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised +of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards. +The AP devices provide cryptographic functions to all CPUs assigned to a +linux system running in an IBM Z system LPAR. + +The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap +is to make AP cards available to KVM guests using the VFIO mediated device +framework. This implementation relies considerably on the s390 virtualization +facilities which do most of the hard work of providing direct access to AP +devices. + +AP Architectural Overview: +========================= +To facilitate the comprehension of the design, let's start with some +definitions: + +* AP adapter + + An AP adapter is an IBM Z adapter card that can perform cryptographic + functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters + assigned to the LPAR in which a linux host is running will be available to + the linux host. Each adapter is identified by a number from 0 to 255; however, + the maximum adapter number is determined by machine model and/or adapter type. + When installed, an AP adapter is accessed by AP instructions executed by any + CPU. + + The AP adapter cards are assigned to a given LPAR via the system's Activation + Profile which can be edited via the HMC. When the linux host system is IPL'd + in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and + creates a sysfs device for each assigned adapter. For example, if AP adapters + 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following + sysfs device entries: + + /sys/devices/ap/card04 + /sys/devices/ap/card0a + + Symbolic links to these devices will also be created in the AP bus devices + sub-directory: + + /sys/bus/ap/devices/[card04] + /sys/bus/ap/devices/[card04] + +* AP domain + + An adapter is partitioned into domains. An adapter can hold up to 256 domains + depending upon the adapter type and hardware configuration. A domain is + identified by a number from 0 to 255; however, the maximum domain number is + determined by machine model and/or adapter type.. A domain can be thought of + as a set of hardware registers and memory used for processing AP commands. A + domain can be configured with a secure private key used for clear key + encryption. A domain is classified in one of two ways depending upon how it + may be accessed: + + * Usage domains are domains that are targeted by an AP instruction to + process an AP command. + + * Control domains are domains that are changed by an AP command sent to a + usage domain; for example, to set the secure private key for the control + domain. + + The AP usage and control domains are assigned to a given LPAR via the system's + Activation Profile which can be edited via the HMC. When a linux host system + is IPL'd in the LPAR, the AP bus module detects the AP usage and control + domains assigned to the LPAR. The domain number of each usage domain and + adapter number of each AP adapter are combined to create AP queue devices + (see AP Queue section below). The domain number of each control domain will be + represented in a bitmask and stored in a sysfs file + /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least + significant bit, correspond to domains 0-255. + +* AP Queue + + An AP queue is the means by which an AP command is sent to a usage domain + inside a specific adapter. An AP queue is identified by a tuple + comprised of an AP adapter ID (APID) and an AP queue index (APQI). The + APQI corresponds to a given usage domain number within the adapter. This tuple + forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP + instructions include a field containing the APQN to identify the AP queue to + which the AP command is to be sent for processing. + + The AP bus will create a sysfs device for each APQN that can be derived from + the cross product of the AP adapter and usage domain numbers detected when the + AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage + domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the + following sysfs entries: + + /sys/devices/ap/card04/04.0006 + /sys/devices/ap/card04/04.0047 + /sys/devices/ap/card0a/0a.0006 + /sys/devices/ap/card0a/0a.0047 + + The following symbolic links to these devices will be created in the AP bus + devices subdirectory: + + /sys/bus/ap/devices/[04.0006] + /sys/bus/ap/devices/[04.0047] + /sys/bus/ap/devices/[0a.0006] + /sys/bus/ap/devices/[0a.0047] + +* AP Instructions: + + There are three AP instructions: + + * NQAP: to enqueue an AP command-request message to a queue + * DQAP: to dequeue an AP command-reply message from a queue + * PQAP: to administer the queues + + AP instructions identify the domain that is targeted to process the AP + command; this must be one of the usage domains. An AP command may modify a + domain that is not one of the usage domains, but the modified domain + must be one of the control domains. + +AP and SIE: +========== +Let's now take a look at how AP instructions executed on a guest are interpreted +by the hardware. + +A satellite control block called the Crypto Control Block (CRYCB) is attached to +our main hardware virtualization control block. The CRYCB contains three fields +to identify the adapters, usage domains and control domains assigned to the KVM +guest: + +* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned + to the KVM guest. Each bit in the mask, from left to right (i.e. from most + significant to least significant bit in big endian order), corresponds to + an APID from 0-255. If a bit is set, the corresponding adapter is valid for + use by the KVM guest. + +* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains + assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from + most significant to least significant bit in big endian order), corresponds to + an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue + is valid for use by the KVM guest. + +* The AP Domain Mask field is a bit mask that identifies the AP control domains + assigned to the KVM guest. The ADM bit mask controls which domains can be + changed by an AP command-request message sent to a usage domain from the + guest. Each bit in the mask, from left to right (i.e. from most significant to + least significant bit in big endian order), corresponds to a domain from + 0-255. If a bit is set, the corresponding domain can be modified by an AP + command-request message sent to a usage domain. + +If you recall from the description of an AP Queue, AP instructions include +an APQN to identify the AP queue to which an AP command-request message is to be +sent (NQAP and PQAP instructions), or from which a command-reply message is to +be received (DQAP instruction). The validity of an APQN is defined by the matrix +calculated from the APM and AQM; it is the cross product of all assigned adapter +numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1 +and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6), +(2,5) and (2,6) will be valid for the guest. + +The APQNs can provide secure key functionality - i.e., a private key is stored +on the adapter card for each of its domains - so each APQN must be assigned to +at most one guest or to the linux host. + + Example 1: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1,2 domain 7 + + This is valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (1,7), (2,7) + + Example 2: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapters 3,4 domains 5,6 + + This is also valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (3,5), (3,6), (4,5), (4,6) + + Example 3: Invalid configuration: + -------------------------------- + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1 domains 6,7 + + This is an invalid configuration because both guests have access to + APQN (1,6). + +The Design: +=========== +The design introduces three new objects: + +1. AP matrix device +2. VFIO AP device driver (vfio_ap.ko) +3. VFIO AP mediated matrix pass-through device + +The VFIO AP device driver +------------------------- +The VFIO AP (vfio_ap) device driver serves the following purposes: + +1. Provides the interfaces to secure APQNs for exclusive use of KVM guests. + +2. Sets up the VFIO mediated device interfaces to manage a mediated matrix + device and creates the sysfs interfaces for assigning adapters, usage + domains, and control domains comprising the matrix for a KVM guest. + +3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's + SIE state description to grant the guest access to a matrix of AP devices + +Reserve APQNs for exclusive use of KVM guests +--------------------------------------------- +The following block diagram illustrates the mechanism by which APQNs are +reserved: + + +------------------+ + 7 remove | | + +--------------------> cex4queue driver | + | | | + | +------------------+ + | + | + | +------------------+ +-----------------+ + | 5 register driver | | 3 create | | + | +----------------> Device core +----------> matrix device | + | | | | | | + | | +--------^---------+ +-----------------+ + | | | + | | +-------------------+ + | | +-----------------------------------+ | + | | | 4 register AP driver | | 2 register device + | | | | | ++--------+---+-v---+ +--------+-------+-+ +| | | | +| ap_bus +--------------------- > vfio_ap driver | +| | 8 probe | | ++--------^---------+ +--^--^------------+ +6 edit | | | + apmask | +-----------------------------+ | 9 mdev create + aqmask | | 1 modprobe | ++--------+-----+---+ +----------------+-+ +------------------+ +| | | |8 create | mediated | +| admin | | VFIO device core |---------> matrix | +| + | | | device | ++------+-+---------+ +--------^---------+ +--------^---------+ + | | | | + | | 9 create vfio_ap-passthrough | | + | +------------------------------+ | + +-------------------------------------------------------------+ + 10 assign adapter/domain/control domain + +The process for reserving an AP queue for use by a KVM guest is: + +1. The administrator loads the vfio_ap device driver +2. The vfio-ap driver during its initialization will register a single 'matrix' + device with the device core. This will serve as the parent device for + all mediated matrix devices used to configure an AP matrix for a guest. +3. The /sys/devices/vfio_ap/matrix device is created by the device core +4 The vfio_ap device driver will register with the AP bus for AP queue devices + of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap + driver's probe and remove callback interfaces. Devices older than CEX4 queues + are not supported to simplify the implementation by not needlessly + complicating the design by supporting older devices that will go out of + service in the relatively near future, and for which there are few older + systems around on which to test. +5. The AP bus registers the vfio_ap device driver with the device core +6. The administrator edits the AP adapter and queue masks to reserve AP queues + for use by the vfio_ap device driver. +7. The AP bus removes the AP queues reserved for the vfio_ap driver from the + default zcrypt cex4queue driver. +8. The AP bus probes the vfio_ap device driver to bind the queues reserved for + it. +9. The administrator creates a passthrough type mediated matrix device to be + used by a guest +10 The administrator assigns the adapters, usage domains and control domains + to be exclusively used by a guest. + +Set up the VFIO mediated device interfaces +------------------------------------------ +The VFIO AP device driver utilizes the common interface of the VFIO mediated +device core driver to: +* Register an AP mediated bus driver to add a mediated matrix device to and + remove it from a VFIO group. +* Create and destroy a mediated matrix device +* Add a mediated matrix device to and remove it from the AP mediated bus driver +* Add a mediated matrix device to and remove it from an IOMMU group + +The following high-level block diagram shows the main components and interfaces +of the VFIO AP mediated matrix device driver: + + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_device() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ap.ko |<-> matrix + | |interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ + +During initialization of the vfio_ap module, the matrix device is registered +with an 'mdev_parent_ops' structure that provides the sysfs attribute +structures, mdev functions and callback interfaces for managing the mediated +matrix device. + +* sysfs attribute structures: + * supported_type_groups + The VFIO mediated device framework supports creation of user-defined + mediated device types. These mediated device types are specified + via the 'supported_type_groups' structure when a device is registered + with the mediated device framework. The registration process creates the + sysfs structures for each mediated device type specified in the + 'mdev_supported_types' sub-directory of the device being registered. Along + with the device type, the sysfs attributes of the mediated device type are + provided. + + The VFIO AP device driver will register one mediated device type for + passthrough devices: + /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough + Only the read-only attributes required by the VFIO mdev framework will + be provided: + ... name + ... device_api + ... available_instances + ... device_api + Where: + * name: specifies the name of the mediated device type + * device_api: the mediated device type's API + * available_instances: the number of mediated matrix passthrough devices + that can be created + * device_api: specifies the VFIO API + * mdev_attr_groups + This attribute group identifies the user-defined sysfs attributes of the + mediated device. When a device is registered with the VFIO mediated device + framework, the sysfs attribute files identified in the 'mdev_attr_groups' + structure will be created in the mediated matrix device's directory. The + sysfs attributes for a mediated matrix device are: + * assign_adapter: + * unassign_adapter: + Write-only attributes for assigning/unassigning an AP adapter to/from the + mediated matrix device. To assign/unassign an adapter, the APID of the + adapter is echoed to the respective attribute file. + * assign_domain: + * unassign_domain: + Write-only attributes for assigning/unassigning an AP usage domain to/from + the mediated matrix device. To assign/unassign a domain, the domain + number of the the usage domain is echoed to the respective attribute + file. + * matrix: + A read-only file for displaying the APQNs derived from the cross product + of the adapter and domain numbers assigned to the mediated matrix device. + * assign_control_domain: + * unassign_control_domain: + Write-only attributes for assigning/unassigning an AP control domain + to/from the mediated matrix device. To assign/unassign a control domain, + the ID of the domain to be assigned/unassigned is echoed to the respective + attribute file. + * control_domains: + A read-only file for displaying the control domain numbers assigned to the + mediated matrix device. + +* functions: + * create: + allocates the ap_matrix_mdev structure used by the vfio_ap driver to: + * Store the reference to the KVM structure for the guest using the mdev + * Store the AP matrix configuration for the adapters, domains, and control + domains assigned via the corresponding sysfs attributes files + * remove: + deallocates the mediated matrix device's ap_matrix_mdev structure. This will + be allowed only if a running guest is not using the mdev. + +* callback interfaces + * open: + The vfio_ap driver uses this callback to register a + VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix + device. The open is invoked when QEMU connects the VFIO iommu group + for the mdev matrix device to the MDEV bus. Access to the KVM structure used + to configure the KVM guest is provided via this callback. The KVM structure, + is used to configure the guest's access to the AP matrix defined via the + mediated matrix device's sysfs attribute files. + * release: + unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the + mdev matrix device and deconfigures the guest's AP matrix. + +Configure the APM, AQM and ADM in the CRYCB: +------------------------------------------- +Configuring the AP matrix for a KVM guest will be performed when the +VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier +function is called when QEMU connects to KVM. The guest's AP matrix is +configured via it's CRYCB by: +* Setting the bits in the APM corresponding to the APIDs assigned to the + mediated matrix device via its 'assign_adapter' interface. +* Setting the bits in the AQM corresponding to the domains assigned to the + mediated matrix device via its 'assign_domain' interface. +* Setting the bits in the ADM corresponding to the domain dIDs assigned to the + mediated matrix device via its 'assign_control_domains' interface. + +The CPU model features for AP +----------------------------- +The AP stack relies on the presence of the AP instructions as well as two +facilities: The AP Facilities Test (APFT) facility; and the AP Query +Configuration Information (QCI) facility. These features/facilities are made +available to a KVM guest via the following CPU model features: + +1. ap: Indicates whether the AP instructions are installed on the guest. This + feature will be enabled by KVM only if the AP instructions are installed + on the host. + +2. apft: Indicates the APFT facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 15 is set). + +3. apqci: Indicates the AP QCI facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 12 is set). + +Note: If the user chooses to specify a CPU model different than the 'host' +model to QEMU, the CPU model features and facilities need to be turned on +explicitly; for example: + + /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on + +A guest can be precluded from using AP features/facilities by turning them off +explicitly; for example: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off + +Note: If the APFT facility is turned off (apft=off) for the guest, the guest +will not see any AP devices. The zcrypt device drivers that register for type 10 +and newer AP devices - i.e., the cex4card and cex4queue device drivers - need +the APFT facility to ascertain the facilities installed on a given AP device. If +the APFT facility is not installed on the guest, then the probe of device +drivers will fail since only type 10 and newer devices can be configured for +guest use. + +Example: +======= +Let's now provide an example to illustrate how KVM guests may be given +access to AP facilities. For this example, we will show how to configure +three guests such that executing the lszcrypt command on the guests would +look like this: + +Guest1 +------ +CARD.DOMAIN TYPE MODE +------------------------------ +05 CEX5C CCA-Coproc +05.0004 CEX5C CCA-Coproc +05.00ab CEX5C CCA-Coproc +06 CEX5A Accelerator +06.0004 CEX5A Accelerator +06.00ab CEX5C CCA-Coproc + +Guest2 +------ +CARD.DOMAIN TYPE MODE +------------------------------ +05 CEX5A Accelerator +05.0047 CEX5A Accelerator +05.00ff CEX5A Accelerator + +Guest2 +------ +CARD.DOMAIN TYPE MODE +------------------------------ +06 CEX5A Accelerator +06.0047 CEX5A Accelerator +06.00ff CEX5A Accelerator + +These are the steps: + +1. Install the vfio_ap module on the linux host. The dependency chain for the + vfio_ap module is: + * iommu + * s390 + * zcrypt + * vfio + * vfio_mdev + * vfio_mdev_device + * KVM + + To build the vfio_ap module, the kernel build must be configured with the + following Kconfig elements selected: + * IOMMU_SUPPORT + * S390 + * ZCRYPT + * S390_AP_IOMMU + * VFIO + * VFIO_MDEV + * VFIO_MDEV_DEVICE + * KVM + + If using make menuconfig select the following to build the vfio_ap module: + -> Device Drivers + -> IOMMU Hardware Support + select S390 AP IOMMU Support + -> VFIO Non-Privileged userspace driver framework + -> Mediated device driver frramework + -> VFIO driver for Mediated devices + -> I/O subsystem + -> VFIO support for AP devices + +2. Secure the AP queues to be used by the three guests so that the host can not + access them. To secure them, there are two sysfs files that specify + bitmasks marking a subset of the APQN range as 'usable by the default AP + queue device drivers' or 'not usable by the default device drivers' and thus + available for use by the vfio_ap device driver'. The location of the sysfs + files containing the masks are: + + /sys/bus/ap/apmask + /sys/bus/ap/aqmask + + The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs + (APID). Each bit in the mask, from left to right (i.e., from most significant + to least significant bit in big endian order), corresponds to an APID from + 0-255. If a bit is set, the APID is marked as usable only by the default AP + queue device drivers; otherwise, the APID is usable by the vfio_ap + device driver. + + The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes + (APQI). Each bit in the mask, from left to right (i.e., from most significant + to least significant bit in big endian order), corresponds to an APQI from + 0-255. If a bit is set, the APQI is marked as usable only by the default AP + queue device drivers; otherwise, the APQI is usable by the vfio_ap device + driver. + + Take, for example, the following mask: + + 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + + It indicates: + + 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6 + belong to the vfio_ap device driver's pool. + + The APQN of each AP queue device assigned to the linux host is checked by the + AP bus against the set of APQNs derived from the cross product of APIDs + and APQIs marked as usable only by the default AP queue device drivers. If a + match is detected, only the default AP queue device drivers will be probed; + otherwise, the vfio_ap device driver will be probed. + + By default, the two masks are set to reserve all APQNs for use by the default + AP queue device drivers. There are two ways the default masks can be changed: + + 1. The sysfs mask files can be edited by echoing a string into the + respective sysfs mask file in one of two formats: + + * An absolute hex string starting with 0x - like "0x12345678" - sets + the mask. If the given string is shorter than the mask, it is padded + with 0s on the right; for example, specifying a mask value of 0x41 is + the same as specifying: + + 0x4100000000000000000000000000000000000000000000000000000000000000 + + Keep in mind that the mask reads from left to right (i.e., most + significant to least significant bit in big endian order), so the mask + above identifies device numbers 1 and 7 (01000001). + + If the string is longer than the mask, the operation is terminated with + an error (EINVAL). + + * Individual bits in the mask can be switched on and off by specifying + each bit number to be switched in a comma separated list. Each bit + number string must be prepended with a ('+') or minus ('-') to indicate + the corresponding bit is to be switched on ('+') or off ('-'). Some + valid values are: + + "+0" switches bit 0 on + "-13" switches bit 13 off + "+0x41" switches bit 65 on + "-0xff" switches bit 255 off + + The following example: + +0,-6,+0x47,-0xf0 + + Switches bits 0 and 71 (0x47) on + Switches bits 6 and 240 (0xf0) off + + Note that the bits not specified in the list remain as they were before + the operation. + + 2. The masks can also be changed at boot time via parameters on the kernel + command line like this: + + ap.apmask=0xffff ap.aqmask=0x40 + + This would create the following masks: + + apmask: + 0xffff000000000000000000000000000000000000000000000000000000000000 + + aqmask: + 0x4000000000000000000000000000000000000000000000000000000000000000 + + Resulting in these two pools: + + default drivers pool: adapter 0-15, domain 1 + alternate drivers pool: adapter 16-255, domains 0, 2-255 + + Securing the APQNs for our example: + ---------------------------------- + To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, + 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding + APQNs can either be removed from the default masks: + + echo -5,-6 > /sys/bus/ap/apmask + + echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask + + Or the masks can be set as follows: + + echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \ + > apmask + + echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \ + > aqmask + + This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, + 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The + sysfs directory for the vfio_ap device driver will now contain symbolic links + to the AP queue devices bound to it: + + /sys/bus/ap + ... [drivers] + ...... [vfio_ap] + ......... [05.0004] + ......... [05.0047] + ......... [05.00ab] + ......... [05.00ff] + ......... [06.0004] + ......... [06.0047] + ......... [06.00ab] + ......... [06.00ff] + + Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later) + can be bound to the vfio_ap device driver. The reason for this is to + simplify the implementation by not needlessly complicating the design by + supporting older devices that will go out of service in the relatively near + future and for which there are few older systems on which to test. + + The administrator, therefore, must take care to secure only AP queues that + can be bound to the vfio_ap device driver. The device type for a given AP + queue device can be read from the parent card's sysfs directory. For example, + to see the hardware type of the queue 05.0004: + + cat /sys/bus/ap/devices/card05/hwtype + + The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the + vfio_ap device driver. + +3. Create the mediated devices needed to configure the AP matrixes for the + three guests and to provide an interface to the vfio_ap driver for + use by the guests: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] (passthrough mediated matrix device type) + --------- create + --------- [devices] + + To create the mediated devices for the three guests: + + uuidgen > create + uuidgen > create + uuidgen > create + + or + + echo $uuid1 > create + echo $uuid2 > create + echo $uuid3 > create + + This will create three mediated devices in the [devices] subdirectory named + after the UUID written to the create attribute file. We call them $uuid1, + $uuid2 and $uuid3 and this is the sysfs directory structure after creation: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + --------------- unassign_control_domain + --------------- unassign_domain + + ------------ [$uuid2] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + + ------------ [$uuid3] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + +4. The administrator now needs to configure the matrixes for the mediated + devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3). + + This is how the matrix is configured for Guest1: + + echo 5 > assign_adapter + echo 6 > assign_adapter + echo 4 > assign_domain + echo 0xab > assign_domain + + Control domains can similarly be assigned using the assign_control_domain + sysfs file. + + If a mistake is made configuring an adapter, domain or control domain, + you can use the unassign_xxx files to unassign the adapter, domain or + control domain. + + To display the matrix configuration for Guest1: + + cat matrix + + This is how the matrix is configured for Guest2: + + echo 5 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + This is how the matrix is configured for Guest3: + + echo 6 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + In order to successfully assign an adapter: + + * The adapter number specified must represent a value from 0 up to the + maximum adapter number configured for the system. If an adapter number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV). + + * All APQNs that can be derived from the adapter ID and the IDs of + the previously assigned domains must be bound to the vfio_ap device + driver. If no domains have yet been assigned, then there must be at least + one APQN with the specified APID bound to the vfio_ap driver. If no such + APQNs are bound to the driver, the operation will terminate with an + error (EADDRNOTAVAIL). + + No APQN that can be derived from the adapter ID and the IDs of the + previously assigned domains can be assigned to another mediated matrix + device. If an APQN is assigned to another mediated matrix device, the + operation will terminate with an error (EADDRINUSE). + + In order to successfully assign a domain: + + * The domain number specified must represent a value from 0 up to the + maximum domain number configured for the system. If a domain number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV). + + * All APQNs that can be derived from the domain ID and the IDs of + the previously assigned adapters must be bound to the vfio_ap device + driver. If no domains have yet been assigned, then there must be at least + one APQN with the specified APQI bound to the vfio_ap driver. If no such + APQNs are bound to the driver, the operation will terminate with an + error (EADDRNOTAVAIL). + + No APQN that can be derived from the domain ID and the IDs of the + previously assigned adapters can be assigned to another mediated matrix + device. If an APQN is assigned to another mediated matrix device, the + operation will terminate with an error (EADDRINUSE). + + In order to successfully assign a control domain, the domain number + specified must represent a value from 0 up to the maximum domain number + configured for the system. If a control domain number higher than the maximum + is specified, the operation will terminate with an error (ENODEV). + +5. Start Guest1: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... + +7. Start Guest2: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ... + +7. Start Guest3: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... + +When the guest is shut down, the mediated matrix devices may be removed. + +Using our example again, to remove the mediated matrix device $uuid1: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- remove + + + echo 1 > remove + + This will remove all of the mdev matrix device's sysfs structures including + the mdev device itself. To recreate and reconfigure the mdev matrix device, + all of the steps starting with step 3 will have to be performed again. Note + that the remove will fail if a guest using the mdev is still running. + + It is not necessary to remove an mdev matrix device, but one may want to + remove it if no guest will use it during the remaining lifetime of the linux + host. If the mdev matrix device is removed, one may want to also reconfigure + the pool of adapters and queues reserved for use by the default drivers. + +Limitations +=========== +* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN + to the default drivers pool of a queue that is still assigned to a mediated + device in use by a guest. It is incumbent upon the administrator to + ensure there is no mediated device in use by a guest to which the APQN is + assigned lest the host be given access to the private data of the AP queue + device such as a private key configured specifically for the guest. + +* Dynamically modifying the AP matrix for a running guest (which would amount to + hot(un)plug of AP devices for the guest) is currently not supported + +* Live guest migration is not supported for guests using AP devices. diff --git a/MAINTAINERS b/MAINTAINERS index e993064637ca..1610fb26bdac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12668,6 +12668,18 @@ W: http://www.ibm.com/developerworks/linux/linux390/ S: Supported F: drivers/s390/crypto/ +S390 VFIO AP DRIVER +M: Tony Krowiak <akrowiak@linux.ibm.com> +M: Pierre Morel <pmorel@linux.ibm.com> +M: Halil Pasic <pasic@linux.ibm.com> +L: linux-s390@vger.kernel.org +W: http://www.ibm.com/developerworks/linux/linux390/ +S: Supported +F: drivers/s390/crypto/vfio_ap_drv.c +F: drivers/s390/crypto/vfio_ap_private.h +F: drivers/s390/crypto/vfio_ap_ops.c +F: Documentation/s390/vfio-ap.txt + S390 ZFCP DRIVER M: Steffen Maier <maier@linux.ibm.com> M: Benjamin Block <bblock@linux.ibm.com> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9a9c7a6fe925..8cc8f25d9576 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -773,6 +773,17 @@ config VFIO_CCW To compile this driver as a module, choose M here: the module will be called vfio_ccw. +config VFIO_AP + def_tristate n + prompt "VFIO support for AP devices" + depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM + help + This driver grants access to Adjunct Processor (AP) devices + via the VFIO mediated device interface. + + To compile this driver as a module, choose M here: the module + will be called vfio_ap. + endmenu menu "Dump support" diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 29c940bf8506..febd1709472a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -44,6 +44,7 @@ #define KVM_REQ_ICPT_OPEREXC KVM_ARCH_REQ(2) #define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3) #define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4) +#define KVM_REQ_VSIE_RESTART KVM_ARCH_REQ(5) #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -186,6 +187,7 @@ struct kvm_s390_sie_block { #define ECA_AIV 0x00200000 #define ECA_VX 0x00020000 #define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 #define ECA_SII 0x00000001 __u32 eca; /* 0x004c */ #define ICPT_INST 0x04 @@ -237,7 +239,11 @@ struct kvm_s390_sie_block { psw_t gpsw; /* 0x0090 */ __u64 gg14; /* 0x00a0 */ __u64 gg15; /* 0x00a8 */ - __u8 reservedb0[20]; /* 0x00b0 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[11]; /* 0x00b9 */ __u16 extcpuaddr; /* 0x00c4 */ __u16 eic; /* 0x00c6 */ __u32 reservedc8; /* 0x00c8 */ @@ -255,6 +261,8 @@ struct kvm_s390_sie_block { __u8 reservede4[4]; /* 0x00e4 */ __u64 tecmc; /* 0x00e8 */ __u8 reservedf0[12]; /* 0x00f0 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 #define CRYCB_FORMAT1 0x00000001 #define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ @@ -715,6 +723,7 @@ struct kvm_s390_crypto { __u32 crycbd; __u8 aes_kw; __u8 dea_kw; + __u8 apie; }; #define APCB0_MASK_SIZE 1 @@ -855,6 +864,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +void kvm_arch_crypto_clear_masks(struct kvm *kvm); + extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 9a50f02b9894..16511d97e8dc 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -160,6 +160,8 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +#define KVM_S390_VM_CRYPTO_ENABLE_APIE 4 +#define KVM_S390_VM_CRYPTO_DISABLE_APIE 5 /* kvm attributes for migration mode */ #define KVM_S390_VM_MIGRATION_STOP 0 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ac5da6b0b862..b7964d77c6cd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -40,6 +40,7 @@ #include <asm/sclp.h> #include <asm/cpacf.h> #include <asm/timex.h> +#include <asm/ap.h> #include "kvm-s390.h" #include "gaccess.h" @@ -844,20 +845,22 @@ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm) kvm_s390_vcpu_block_all(kvm); - kvm_for_each_vcpu(i, vcpu, kvm) + kvm_for_each_vcpu(i, vcpu, kvm) { kvm_s390_vcpu_crypto_setup(vcpu); + /* recreate the shadow crycb by leaving the VSIE handler */ + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } kvm_s390_vcpu_unblock_all(kvm); } static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) { - if (!test_kvm_facility(kvm, 76)) - return -EINVAL; - mutex_lock(&kvm->lock); switch (attr->attr) { case KVM_S390_VM_CRYPTO_ENABLE_AES_KW: + if (!test_kvm_facility(kvm, 76)) + return -EINVAL; get_random_bytes( kvm->arch.crypto.crycb->aes_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); @@ -865,6 +868,8 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: + if (!test_kvm_facility(kvm, 76)) + return -EINVAL; get_random_bytes( kvm->arch.crypto.crycb->dea_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); @@ -872,17 +877,35 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: + if (!test_kvm_facility(kvm, 76)) + return -EINVAL; kvm->arch.crypto.aes_kw = 0; memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: + if (!test_kvm_facility(kvm, 76)) + return -EINVAL; kvm->arch.crypto.dea_kw = 0; memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support"); break; + case KVM_S390_VM_CRYPTO_ENABLE_APIE: + if (!ap_instructions_available()) { + mutex_unlock(&kvm->lock); + return -EOPNOTSUPP; + } + kvm->arch.crypto.apie = 1; + break; + case KVM_S390_VM_CRYPTO_DISABLE_APIE: + if (!ap_instructions_available()) { + mutex_unlock(&kvm->lock); + return -EOPNOTSUPP; + } + kvm->arch.crypto.apie = 0; + break; default: mutex_unlock(&kvm->lock); return -ENXIO; @@ -1491,6 +1514,10 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: ret = 0; break; + case KVM_S390_VM_CRYPTO_ENABLE_APIE: + case KVM_S390_VM_CRYPTO_DISABLE_APIE: + ret = ap_instructions_available() ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -1992,55 +2019,60 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -static int kvm_s390_query_ap_config(u8 *config) -{ - u32 fcn_code = 0x04000000UL; - u32 cc = 0; - - memset(config, 0, 128); - asm volatile( - "lgr 0,%1\n" - "lgr 2,%2\n" - ".long 0xb2af0000\n" /* PQAP(QCI) */ - "0: ipm %0\n" - "srl %0,28\n" - "1:\n" - EX_TABLE(0b, 1b) - : "+r" (cc) - : "r" (fcn_code), "r" (config) - : "cc", "0", "2", "memory" - ); - - return cc; -} - static int kvm_s390_apxa_installed(void) { - u8 config[128]; - int cc; - - if (test_facility(12)) { - cc = kvm_s390_query_ap_config(config); + struct ap_config_info info; - if (cc) - pr_err("PQAP(QCI) failed with cc=%d", cc); - else - return config[0] & 0x40; + if (ap_instructions_available()) { + if (ap_qci(&info) == 0) + return info.apxa; } return 0; } +/* + * The format of the crypto control block (CRYCB) is specified in the 3 low + * order bits of the CRYCB designation (CRYCBD) field as follows: + * Format 0: Neither the message security assist extension 3 (MSAX3) nor the + * AP extended addressing (APXA) facility are installed. + * Format 1: The APXA facility is not installed but the MSAX3 facility is. + * Format 2: Both the APXA and MSAX3 facilities are installed + */ static void kvm_s390_set_crycb_format(struct kvm *kvm) { kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb; + /* Clear the CRYCB format bits - i.e., set format 0 by default */ + kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK); + + /* Check whether MSAX3 is installed */ + if (!test_kvm_facility(kvm, 76)) + return; + if (kvm_s390_apxa_installed()) kvm->arch.crypto.crycbd |= CRYCB_FORMAT2; else kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } +void kvm_arch_crypto_clear_masks(struct kvm *kvm) +{ + mutex_lock(&kvm->lock); + kvm_s390_vcpu_block_all(kvm); + + memset(&kvm->arch.crypto.crycb->apcb0, 0, + sizeof(kvm->arch.crypto.crycb->apcb0)); + memset(&kvm->arch.crypto.crycb->apcb1, 0, + sizeof(kvm->arch.crypto.crycb->apcb1)); + + /* recreate the shadow crycb for each vcpu */ + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); + kvm_s390_vcpu_unblock_all(kvm); + mutex_unlock(&kvm->lock); +} +EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); + static u64 kvm_s390_get_initial_cpuid(void) { struct cpuid cpuid; @@ -2052,12 +2084,12 @@ static u64 kvm_s390_get_initial_cpuid(void) static void kvm_s390_crypto_init(struct kvm *kvm) { - if (!test_kvm_facility(kvm, 76)) - return; - kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; kvm_s390_set_crycb_format(kvm); + if (!test_kvm_facility(kvm, 76)) + return; + /* Enable AES/DEA protected key functions by default */ kvm->arch.crypto.aes_kw = 1; kvm->arch.crypto.dea_kw = 1; @@ -2583,17 +2615,25 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) { - if (!test_kvm_facility(vcpu->kvm, 76)) + /* + * If the AP instructions are not being interpreted and the MSAX3 + * facility is not configured for the guest, there is nothing to set up. + */ + if (!vcpu->kvm->arch.crypto.apie && !test_kvm_facility(vcpu->kvm, 76)) return; + vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); + vcpu->arch.sie_block->eca &= ~ECA_APIE; + + if (vcpu->kvm->arch.crypto.apie) + vcpu->arch.sie_block->eca |= ECA_APIE; + /* Set up protected key support */ if (vcpu->kvm->arch.crypto.aes_kw) vcpu->arch.sie_block->ecb3 |= ECB3_AES; if (vcpu->kvm->arch.crypto.dea_kw) vcpu->arch.sie_block->ecb3 |= ECB3_DEA; - - vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; } void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) @@ -2685,6 +2725,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; + vcpu->arch.sie_block->hpid = HPID_KVM; + kvm_s390_vcpu_crypto_setup(vcpu); return rc; @@ -2768,18 +2810,25 @@ static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu) exit_sie(vcpu); } +bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu) +{ + return atomic_read(&vcpu->arch.sie_block->prog20) & + (PROG_BLOCK_SIE | PROG_REQUEST); +} + static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu) { atomic_andnot(PROG_REQUEST, &vcpu->arch.sie_block->prog20); } /* - * Kick a guest cpu out of SIE and wait until SIE is not running. + * Kick a guest cpu out of (v)SIE and wait until (v)SIE is not running. * If the CPU is not running (e.g. waiting as idle) the function will * return immediately. */ void exit_sie(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); + kvm_s390_vsie_kick(vcpu); while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE) cpu_relax(); } @@ -3196,6 +3245,8 @@ retry: /* nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_UNHALT, vcpu); + /* we left the vsie handler, nothing to do, just clear the request */ + kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu); return 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 981e3ba97461..1f6e36cdce0d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -290,6 +290,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); +bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); void exit_sie(struct kvm_vcpu *vcpu); void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu); int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a2b28cd1e3fe..a153257bf7d9 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -135,14 +135,148 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) atomic_set(&scb_s->cpuflags, newflags); return 0; } +/* Copy to APCB FORMAT1 from APCB FORMAT0 */ +static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, + unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h) +{ + struct kvm_s390_apcb0 tmp; -/* + if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0))) + return -EFAULT; + + apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; + apcb_s->aqm[0] = apcb_h->aqm[0] & tmp.aqm[0] & 0xffff000000000000UL; + apcb_s->adm[0] = apcb_h->adm[0] & tmp.adm[0] & 0xffff000000000000UL; + + return 0; + +} + +/** + * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 + * @vcpu: pointer to the virtual CPU + * @apcb_s: pointer to start of apcb in the shadow crycb + * @apcb_o: pointer to start of original apcb in the guest2 + * @apcb_h: pointer to start of apcb in the guest1 + * + * Returns 0 and -EFAULT on error reading guest apcb + */ +static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, + unsigned long apcb_o, unsigned long *apcb_h) +{ + if (read_guest_real(vcpu, apcb_o, apcb_s, + sizeof(struct kvm_s390_apcb0))) + return -EFAULT; + + bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0)); + + return 0; +} + +/** + * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB + * @vcpu: pointer to the virtual CPU + * @apcb_s: pointer to start of apcb in the shadow crycb + * @apcb_o: pointer to start of original guest apcb + * @apcb_h: pointer to start of apcb in the host + * + * Returns 0 and -EFAULT on error reading guest apcb + */ +static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, + unsigned long apcb_o, + unsigned long *apcb_h) +{ + if (read_guest_real(vcpu, apcb_o, apcb_s, + sizeof(struct kvm_s390_apcb1))) + return -EFAULT; + + bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1)); + + return 0; +} + +/** + * setup_apcb - Create a shadow copy of the apcb. + * @vcpu: pointer to the virtual CPU + * @crycb_s: pointer to shadow crycb + * @crycb_o: pointer to original guest crycb + * @crycb_h: pointer to the host crycb + * @fmt_o: format of the original guest crycb. + * @fmt_h: format of the host crycb. + * + * Checks the compatibility between the guest and host crycb and calls the + * appropriate copy function. + * + * Return 0 or an error number if the guest and host crycb are incompatible. + */ +static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, + const u32 crycb_o, + struct kvm_s390_crypto_cb *crycb_h, + int fmt_o, int fmt_h) +{ + struct kvm_s390_crypto_cb *crycb; + + crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o; + + switch (fmt_o) { + case CRYCB_FORMAT2: + if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK)) + return -EACCES; + if (fmt_h != CRYCB_FORMAT2) + return -EINVAL; + return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, + (unsigned long) &crycb->apcb1, + (unsigned long *)&crycb_h->apcb1); + case CRYCB_FORMAT1: + switch (fmt_h) { + case CRYCB_FORMAT2: + return setup_apcb10(vcpu, &crycb_s->apcb1, + (unsigned long) &crycb->apcb0, + &crycb_h->apcb1); + case CRYCB_FORMAT1: + return setup_apcb00(vcpu, + (unsigned long *) &crycb_s->apcb0, + (unsigned long) &crycb->apcb0, + (unsigned long *) &crycb_h->apcb0); + } + break; + case CRYCB_FORMAT0: + if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK)) + return -EACCES; + + switch (fmt_h) { + case CRYCB_FORMAT2: + return setup_apcb10(vcpu, &crycb_s->apcb1, + (unsigned long) &crycb->apcb0, + &crycb_h->apcb1); + case CRYCB_FORMAT1: + case CRYCB_FORMAT0: + return setup_apcb00(vcpu, + (unsigned long *) &crycb_s->apcb0, + (unsigned long) &crycb->apcb0, + (unsigned long *) &crycb_h->apcb0); + } + } + return -EINVAL; +} + +/** + * shadow_crycb - Create a shadow copy of the crycb block + * @vcpu: a pointer to the virtual CPU + * @vsie_page: a pointer to internal date used for the vSIE + * * Create a shadow copy of the crycb block and setup key wrapping, if * requested for guest 3 and enabled for guest 2. * - * We only accept format-1 (no AP in g2), but convert it into format-2 + * We accept format-1 or format-2, but we convert format-1 into format-2 + * in the shadow CRYCB. + * Using format-2 enables the firmware to choose the right format when + * scheduling the SIE. * There is nothing to do for format-0. * + * This function centralize the issuing of set_validity_icpt() for all + * the subfunctions working on the crycb. + * * Returns: - 0 if shadowed or nothing to do * - > 0 if control has to be given to guest 2 */ @@ -154,23 +288,40 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) const u32 crycb_addr = crycbd_o & 0x7ffffff8U; unsigned long *b1, *b2; u8 ecb3_flags; + int apie_h; + int key_msk = test_kvm_facility(vcpu->kvm, 76); + int fmt_o = crycbd_o & CRYCB_FORMAT_MASK; + int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK; + int ret = 0; scb_s->crycbd = 0; - if (!(crycbd_o & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) - return 0; - /* format-1 is supported with message-security-assist extension 3 */ - if (!test_kvm_facility(vcpu->kvm, 76)) + + apie_h = vcpu->arch.sie_block->eca & ECA_APIE; + if (!apie_h && !key_msk) return 0; + + if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + if (fmt_o == CRYCB_FORMAT1) + if ((crycb_addr & PAGE_MASK) != + ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + + if (apie_h && (scb_o->eca & ECA_APIE)) { + ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr, + vcpu->kvm->arch.crypto.crycb, + fmt_o, fmt_h); + if (ret) + goto end; + scb_s->eca |= scb_o->eca & ECA_APIE; + } + /* we may only allow it if enabled for guest 2 */ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & (ECB3_AES | ECB3_DEA); if (!ecb3_flags) - return 0; - - if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK)) - return set_validity_icpt(scb_s, 0x003CU); - else if (!crycb_addr) - return set_validity_icpt(scb_s, 0x0039U); + goto end; /* copy only the wrapping keys */ if (read_guest_real(vcpu, crycb_addr + 72, @@ -178,8 +329,6 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0035U); scb_s->ecb3 |= ecb3_flags; - scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 | - CRYCB_FORMAT2; /* xor both blocks in one run */ b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; @@ -187,6 +336,16 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; /* as 56%8 == 0, bitmap_xor won't overwrite any data */ bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); +end: + switch (ret) { + case -EINVAL: + return set_validity_icpt(scb_s, 0x0020U); + case -EFAULT: + return set_validity_icpt(scb_s, 0x0035U); + case -EACCES: + return set_validity_icpt(scb_s, 0x003CU); + } + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2; return 0; } @@ -383,6 +542,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_facility(vcpu->kvm, 156)) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; + scb_s->hpid = HPID_VSIE; + prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); out: @@ -830,7 +991,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; int guest_bp_isolation; - int rc; + int rc = 0; handle_last_fault(vcpu, vsie_page); @@ -858,7 +1019,18 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) guest_enter_irqoff(); local_irq_enable(); - rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + /* + * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking + * and VCPU requests also hinder the vSIE from running and lead + * to an immediate exit. kvm_s390_vsie_kick() has to be used to + * also kick the vSIE. + */ + vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; + barrier(); + if (!kvm_s390_vcpu_sie_inhibited(vcpu)) + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + barrier(); + vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; local_irq_disable(); guest_exit_irqoff(); @@ -1005,7 +1177,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (rc == -EAGAIN) rc = 0; if (rc || scb_s->icptcode || signal_pending(current) || - kvm_s390_vcpu_has_irq(vcpu, 0)) + kvm_s390_vcpu_has_irq(vcpu, 0) || + kvm_s390_vcpu_sie_inhibited(vcpu)) break; } @@ -1122,7 +1295,8 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (unlikely(scb_addr & 0x1ffUL)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0)) + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || + kvm_s390_vcpu_sie_inhibited(vcpu)) return 0; vsie_page = get_vsie_page(vcpu->kvm, scb_addr); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 911c7ded35f1..1e668b95e0c6 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -907,10 +907,16 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); - spin_lock(&gmap->guest_table_lock); pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + if (!pmdp) + return NULL; - if (!pmdp || pmd_none(*pmdp)) { + /* without huge pages, there is no need to take the table lock */ + if (!gmap->mm->context.allow_gmap_hpage_1m) + return pmd_none(*pmdp) ? NULL : pmdp; + + spin_lock(&gmap->guest_table_lock); + if (pmd_none(*pmdp)) { spin_unlock(&gmap->guest_table_lock); return NULL; } diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 0c85aedcf9b3..fd788e0f2e5b 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -106,6 +106,8 @@ static struct facility_def facility_defs[] = { .name = "FACILITIES_KVM_CPUMODEL", .bits = (int[]){ + 12, /* AP Query Configuration Information */ + 15, /* AP Facilities Test */ 156, /* etoken facility */ -1 /* END */ } diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c60395b7470f..83e6d993fca5 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -372,6 +372,14 @@ config S390_CCW_IOMMU Enables bits of IOMMU API required by VFIO. The iommu_ops is not implemented as it is not necessary for VFIO. +config S390_AP_IOMMU + bool "S390 AP IOMMU Support" + depends on S390 && ZCRYPT + select IOMMU_API + help + Enables bits of IOMMU API required by VFIO. The iommu_ops + is not implemented as it is not necessary for VFIO. + config MTK_IOMMU bool "MTK IOMMU Support" depends on ARM || ARM64 diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile index b59af548ed1c..8d36b05a7575 100644 --- a/drivers/s390/crypto/Makefile +++ b/drivers/s390/crypto/Makefile @@ -15,3 +15,7 @@ obj-$(CONFIG_ZCRYPT) += zcrypt_pcixcc.o zcrypt_cex2a.o zcrypt_cex4.o # pkey kernel module pkey-objs := pkey_api.o obj-$(CONFIG_PKEY) += pkey.o + +# adjunct processor matrix +vfio_ap-objs := vfio_ap_drv.o vfio_ap_ops.o +obj-$(CONFIG_VFIO_AP) += vfio_ap.o diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c new file mode 100644 index 000000000000..8b51821d9bf7 --- /dev/null +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VFIO based AP device driver + * + * Copyright IBM Corp. 2018 + * + * Author(s): Tony Krowiak <akrowiak@linux.ibm.com> + */ + +#include <linux/module.h> +#include <linux/mod_devicetable.h> +#include <linux/slab.h> +#include <linux/string.h> +#include "vfio_ap_private.h" + +#define VFIO_AP_ROOT_NAME "vfio_ap" +#define VFIO_AP_DEV_TYPE_NAME "ap_matrix" +#define VFIO_AP_DEV_NAME "matrix" + +MODULE_AUTHOR("IBM Corporation"); +MODULE_DESCRIPTION("VFIO AP device driver, Copyright IBM Corp. 2018"); +MODULE_LICENSE("GPL v2"); + +static struct ap_driver vfio_ap_drv; + +static struct device_type vfio_ap_dev_type = { + .name = VFIO_AP_DEV_TYPE_NAME, +}; + +struct ap_matrix_dev *matrix_dev; + +/* Only type 10 adapters (CEX4 and later) are supported + * by the AP matrix device driver + */ +static struct ap_device_id ap_queue_ids[] = { + { .dev_type = AP_DEVICE_TYPE_CEX4, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX5, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX6, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { /* end of sibling */ }, +}; + +MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids); + +static int vfio_ap_queue_dev_probe(struct ap_device *apdev) +{ + return 0; +} + +static void vfio_ap_queue_dev_remove(struct ap_device *apdev) +{ + /* Nothing to do yet */ +} + +static void vfio_ap_matrix_dev_release(struct device *dev) +{ + struct ap_matrix_dev *matrix_dev = dev_get_drvdata(dev); + + kfree(matrix_dev); +} + +static int vfio_ap_matrix_dev_create(void) +{ + int ret; + struct device *root_device; + + root_device = root_device_register(VFIO_AP_ROOT_NAME); + if (IS_ERR(root_device)) + return PTR_ERR(root_device); + + matrix_dev = kzalloc(sizeof(*matrix_dev), GFP_KERNEL); + if (!matrix_dev) { + ret = -ENOMEM; + goto matrix_alloc_err; + } + + /* Fill in config info via PQAP(QCI), if available */ + if (test_facility(12)) { + ret = ap_qci(&matrix_dev->info); + if (ret) + goto matrix_alloc_err; + } + + mutex_init(&matrix_dev->lock); + INIT_LIST_HEAD(&matrix_dev->mdev_list); + + matrix_dev->device.type = &vfio_ap_dev_type; + dev_set_name(&matrix_dev->device, "%s", VFIO_AP_DEV_NAME); + matrix_dev->device.parent = root_device; + matrix_dev->device.release = vfio_ap_matrix_dev_release; + matrix_dev->device.driver = &vfio_ap_drv.driver; + + ret = device_register(&matrix_dev->device); + if (ret) + goto matrix_reg_err; + + return 0; + +matrix_reg_err: + put_device(&matrix_dev->device); +matrix_alloc_err: + root_device_unregister(root_device); + + return ret; +} + +static void vfio_ap_matrix_dev_destroy(void) +{ + device_unregister(&matrix_dev->device); + root_device_unregister(matrix_dev->device.parent); +} + +int __init vfio_ap_init(void) +{ + int ret; + + /* If there are no AP instructions, there is nothing to pass through. */ + if (!ap_instructions_available()) + return -ENODEV; + + ret = vfio_ap_matrix_dev_create(); + if (ret) + return ret; + + memset(&vfio_ap_drv, 0, sizeof(vfio_ap_drv)); + vfio_ap_drv.probe = vfio_ap_queue_dev_probe; + vfio_ap_drv.remove = vfio_ap_queue_dev_remove; + vfio_ap_drv.ids = ap_queue_ids; + + ret = ap_driver_register(&vfio_ap_drv, THIS_MODULE, VFIO_AP_DRV_NAME); + if (ret) { + vfio_ap_matrix_dev_destroy(); + return ret; + } + + ret = vfio_ap_mdev_register(); + if (ret) { + ap_driver_unregister(&vfio_ap_drv); + vfio_ap_matrix_dev_destroy(); + + return ret; + } + + return 0; +} + +void __exit vfio_ap_exit(void) +{ + vfio_ap_mdev_unregister(); + ap_driver_unregister(&vfio_ap_drv); + vfio_ap_matrix_dev_destroy(); +} + +module_init(vfio_ap_init); +module_exit(vfio_ap_exit); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c new file mode 100644 index 000000000000..d3d9eb72b0f1 --- /dev/null +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -0,0 +1,968 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Adjunct processor matrix VFIO device driver callbacks. + * + * Copyright IBM Corp. 2018 + * + * Author(s): Tony Krowiak <akrowiak@linux.ibm.com> + * Halil Pasic <pasic@linux.ibm.com> + * Pierre Morel <pmorel@linux.ibm.com> + */ +#include <linux/string.h> +#include <linux/vfio.h> +#include <linux/device.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <linux/bitops.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <asm/kvm.h> +#include <asm/zcrypt.h> + +#include "vfio_ap_private.h" + +#define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough" +#define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device" + +static void vfio_ap_matrix_init(struct ap_config_info *info, + struct ap_matrix *matrix) +{ + matrix->apm_max = info->apxa ? info->Na : 63; + matrix->aqm_max = info->apxa ? info->Nd : 15; + matrix->adm_max = info->apxa ? info->Nd : 15; +} + +static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev) +{ + struct ap_matrix_mdev *matrix_mdev; + + if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0)) + return -EPERM; + + matrix_mdev = kzalloc(sizeof(*matrix_mdev), GFP_KERNEL); + if (!matrix_mdev) { + atomic_inc(&matrix_dev->available_instances); + return -ENOMEM; + } + + vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); + mdev_set_drvdata(mdev, matrix_mdev); + mutex_lock(&matrix_dev->lock); + list_add(&matrix_mdev->node, &matrix_dev->mdev_list); + mutex_unlock(&matrix_dev->lock); + + return 0; +} + +static int vfio_ap_mdev_remove(struct mdev_device *mdev) +{ + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + + if (matrix_mdev->kvm) + return -EBUSY; + + mutex_lock(&matrix_dev->lock); + list_del(&matrix_mdev->node); + mutex_unlock(&matrix_dev->lock); + + kfree(matrix_mdev); + mdev_set_drvdata(mdev, NULL); + atomic_inc(&matrix_dev->available_instances); + + return 0; +} + +static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf) +{ + return sprintf(buf, "%s\n", VFIO_AP_MDEV_NAME_HWVIRT); +} + +MDEV_TYPE_ATTR_RO(name); + +static ssize_t available_instances_show(struct kobject *kobj, + struct device *dev, char *buf) +{ + return sprintf(buf, "%d\n", + atomic_read(&matrix_dev->available_instances)); +} + +MDEV_TYPE_ATTR_RO(available_instances); + +static ssize_t device_api_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_AP_STRING); +} + +MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *vfio_ap_mdev_type_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_available_instances.attr, + NULL, +}; + +static struct attribute_group vfio_ap_mdev_hwvirt_type_group = { + .name = VFIO_AP_MDEV_TYPE_HWVIRT, + .attrs = vfio_ap_mdev_type_attrs, +}; + +static struct attribute_group *vfio_ap_mdev_type_groups[] = { + &vfio_ap_mdev_hwvirt_type_group, + NULL, +}; + +struct vfio_ap_queue_reserved { + unsigned long *apid; + unsigned long *apqi; + bool reserved; +}; + +/** + * vfio_ap_has_queue + * + * @dev: an AP queue device + * @data: a struct vfio_ap_queue_reserved reference + * + * Flags whether the AP queue device (@dev) has a queue ID containing the APQN, + * apid or apqi specified in @data: + * + * - If @data contains both an apid and apqi value, then @data will be flagged + * as reserved if the APID and APQI fields for the AP queue device matches + * + * - If @data contains only an apid value, @data will be flagged as + * reserved if the APID field in the AP queue device matches + * + * - If @data contains only an apqi value, @data will be flagged as + * reserved if the APQI field in the AP queue device matches + * + * Returns 0 to indicate the input to function succeeded. Returns -EINVAL if + * @data does not contain either an apid or apqi. + */ +static int vfio_ap_has_queue(struct device *dev, void *data) +{ + struct vfio_ap_queue_reserved *qres = data; + struct ap_queue *ap_queue = to_ap_queue(dev); + ap_qid_t qid; + unsigned long id; + + if (qres->apid && qres->apqi) { + qid = AP_MKQID(*qres->apid, *qres->apqi); + if (qid == ap_queue->qid) + qres->reserved = true; + } else if (qres->apid && !qres->apqi) { + id = AP_QID_CARD(ap_queue->qid); + if (id == *qres->apid) + qres->reserved = true; + } else if (!qres->apid && qres->apqi) { + id = AP_QID_QUEUE(ap_queue->qid); + if (id == *qres->apqi) + qres->reserved = true; + } else { + return -EINVAL; + } + + return 0; +} + +/** + * vfio_ap_verify_queue_reserved + * + * @matrix_dev: a mediated matrix device + * @apid: an AP adapter ID + * @apqi: an AP queue index + * + * Verifies that the AP queue with @apid/@apqi is reserved by the VFIO AP device + * driver according to the following rules: + * + * - If both @apid and @apqi are not NULL, then there must be an AP queue + * device bound to the vfio_ap driver with the APQN identified by @apid and + * @apqi + * + * - If only @apid is not NULL, then there must be an AP queue device bound + * to the vfio_ap driver with an APQN containing @apid + * + * - If only @apqi is not NULL, then there must be an AP queue device bound + * to the vfio_ap driver with an APQN containing @apqi + * + * Returns 0 if the AP queue is reserved; otherwise, returns -EADDRNOTAVAIL. + */ +static int vfio_ap_verify_queue_reserved(unsigned long *apid, + unsigned long *apqi) +{ + int ret; + struct vfio_ap_queue_reserved qres; + + qres.apid = apid; + qres.apqi = apqi; + qres.reserved = false; + + ret = driver_for_each_device(matrix_dev->device.driver, NULL, &qres, + vfio_ap_has_queue); + if (ret) + return ret; + + if (qres.reserved) + return 0; + + return -EADDRNOTAVAIL; +} + +static int +vfio_ap_mdev_verify_queues_reserved_for_apid(struct ap_matrix_mdev *matrix_mdev, + unsigned long apid) +{ + int ret; + unsigned long apqi; + unsigned long nbits = matrix_mdev->matrix.aqm_max + 1; + + if (find_first_bit_inv(matrix_mdev->matrix.aqm, nbits) >= nbits) + return vfio_ap_verify_queue_reserved(&apid, NULL); + + for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, nbits) { + ret = vfio_ap_verify_queue_reserved(&apid, &apqi); + if (ret) + return ret; + } + + return 0; +} + +/** + * vfio_ap_mdev_verify_no_sharing + * + * Verifies that the APQNs derived from the cross product of the AP adapter IDs + * and AP queue indexes comprising the AP matrix are not configured for another + * mediated device. AP queue sharing is not allowed. + * + * @matrix_mdev: the mediated matrix device + * + * Returns 0 if the APQNs are not shared, otherwise; returns -EADDRINUSE. + */ +static int vfio_ap_mdev_verify_no_sharing(struct ap_matrix_mdev *matrix_mdev) +{ + struct ap_matrix_mdev *lstdev; + DECLARE_BITMAP(apm, AP_DEVICES); + DECLARE_BITMAP(aqm, AP_DOMAINS); + + list_for_each_entry(lstdev, &matrix_dev->mdev_list, node) { + if (matrix_mdev == lstdev) + continue; + + memset(apm, 0, sizeof(apm)); + memset(aqm, 0, sizeof(aqm)); + + /* + * We work on full longs, as we can only exclude the leftover + * bits in non-inverse order. The leftover is all zeros. + */ + if (!bitmap_and(apm, matrix_mdev->matrix.apm, + lstdev->matrix.apm, AP_DEVICES)) + continue; + + if (!bitmap_and(aqm, matrix_mdev->matrix.aqm, + lstdev->matrix.aqm, AP_DOMAINS)) + continue; + + return -EADDRINUSE; + } + + return 0; +} + +/** + * assign_adapter_store + * + * @dev: the matrix device + * @attr: the mediated matrix device's assign_adapter attribute + * @buf: a buffer containing the AP adapter number (APID) to + * be assigned + * @count: the number of bytes in @buf + * + * Parses the APID from @buf and sets the corresponding bit in the mediated + * matrix device's APM. + * + * Returns the number of bytes processed if the APID is valid; otherwise, + * returns one of the following errors: + * + * 1. -EINVAL + * The APID is not a valid number + * + * 2. -ENODEV + * The APID exceeds the maximum value configured for the system + * + * 3. -EADDRNOTAVAIL + * An APQN derived from the cross product of the APID being assigned + * and the APQIs previously assigned is not bound to the vfio_ap device + * driver; or, if no APQIs have yet been assigned, the APID is not + * contained in an APQN bound to the vfio_ap device driver. + * + * 4. -EADDRINUSE + * An APQN derived from the cross product of the APID being assigned + * and the APQIs previously assigned is being used by another mediated + * matrix device + */ +static ssize_t assign_adapter_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long apid; + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + + /* If the guest is running, disallow assignment of adapter */ + if (matrix_mdev->kvm) + return -EBUSY; + + ret = kstrtoul(buf, 0, &apid); + if (ret) + return ret; + + if (apid > matrix_mdev->matrix.apm_max) + return -ENODEV; + + /* + * Set the bit in the AP mask (APM) corresponding to the AP adapter + * number (APID). The bits in the mask, from most significant to least + * significant bit, correspond to APIDs 0-255. + */ + mutex_lock(&matrix_dev->lock); + + ret = vfio_ap_mdev_verify_queues_reserved_for_apid(matrix_mdev, apid); + if (ret) + goto done; + + set_bit_inv(apid, matrix_mdev->matrix.apm); + + ret = vfio_ap_mdev_verify_no_sharing(matrix_mdev); + if (ret) + goto share_err; + + ret = count; + goto done; + +share_err: + clear_bit_inv(apid, matrix_mdev->matrix.apm); +done: + mutex_unlock(&matrix_dev->lock); + + return ret; +} +static DEVICE_ATTR_WO(assign_adapter); + +/** + * unassign_adapter_store + * + * @dev: the matrix device + * @attr: the mediated matrix device's unassign_adapter attribute + * @buf: a buffer containing the adapter number (APID) to be unassigned + * @count: the number of bytes in @buf + * + * Parses the APID from @buf and clears the corresponding bit in the mediated + * matrix device's APM. + * + * Returns the number of bytes processed if the APID is valid; otherwise, + * returns one of the following errors: + * -EINVAL if the APID is not a number + * -ENODEV if the APID it exceeds the maximum value configured for the + * system + */ +static ssize_t unassign_adapter_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long apid; + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + + /* If the guest is running, disallow un-assignment of adapter */ + if (matrix_mdev->kvm) + return -EBUSY; + + ret = kstrtoul(buf, 0, &apid); + if (ret) + return ret; + + if (apid > matrix_mdev->matrix.apm_max) + return -ENODEV; + + mutex_lock(&matrix_dev->lock); + clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm); + mutex_unlock(&matrix_dev->lock); + + return count; +} +DEVICE_ATTR_WO(unassign_adapter); + +static int +vfio_ap_mdev_verify_queues_reserved_for_apqi(struct ap_matrix_mdev *matrix_mdev, + unsigned long apqi) +{ + int ret; + unsigned long apid; + unsigned long nbits = matrix_mdev->matrix.apm_max + 1; + + if (find_first_bit_inv(matrix_mdev->matrix.apm, nbits) >= nbits) + return vfio_ap_verify_queue_reserved(NULL, &apqi); + + for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, nbits) { + ret = vfio_ap_verify_queue_reserved(&apid, &apqi); + if (ret) + return ret; + } + + return 0; +} + +/** + * assign_domain_store + * + * @dev: the matrix device + * @attr: the mediated matrix device's assign_domain attribute + * @buf: a buffer containing the AP queue index (APQI) of the domain to + * be assigned + * @count: the number of bytes in @buf + * + * Parses the APQI from @buf and sets the corresponding bit in the mediated + * matrix device's AQM. + * + * Returns the number of bytes processed if the APQI is valid; otherwise returns + * one of the following errors: + * + * 1. -EINVAL + * The APQI is not a valid number + * + * 2. -ENODEV + * The APQI exceeds the maximum value configured for the system + * + * 3. -EADDRNOTAVAIL + * An APQN derived from the cross product of the APQI being assigned + * and the APIDs previously assigned is not bound to the vfio_ap device + * driver; or, if no APIDs have yet been assigned, the APQI is not + * contained in an APQN bound to the vfio_ap device driver. + * + * 4. -EADDRINUSE + * An APQN derived from the cross product of the APQI being assigned + * and the APIDs previously assigned is being used by another mediated + * matrix device + */ +static ssize_t assign_domain_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long apqi; + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + unsigned long max_apqi = matrix_mdev->matrix.aqm_max; + + /* If the guest is running, disallow assignment of domain */ + if (matrix_mdev->kvm) + return -EBUSY; + + ret = kstrtoul(buf, 0, &apqi); + if (ret) + return ret; + if (apqi > max_apqi) + return -ENODEV; + + mutex_lock(&matrix_dev->lock); + + ret = vfio_ap_mdev_verify_queues_reserved_for_apqi(matrix_mdev, apqi); + if (ret) + goto done; + + set_bit_inv(apqi, matrix_mdev->matrix.aqm); + + ret = vfio_ap_mdev_verify_no_sharing(matrix_mdev); + if (ret) + goto share_err; + + ret = count; + goto done; + +share_err: + clear_bit_inv(apqi, matrix_mdev->matrix.aqm); +done: + mutex_unlock(&matrix_dev->lock); + + return ret; +} +DEVICE_ATTR_WO(assign_domain); + + +/** + * unassign_domain_store + * + * @dev: the matrix device + * @attr: the mediated matrix device's unassign_domain attribute + * @buf: a buffer containing the AP queue index (APQI) of the domain to + * be unassigned + * @count: the number of bytes in @buf + * + * Parses the APQI from @buf and clears the corresponding bit in the + * mediated matrix device's AQM. + * + * Returns the number of bytes processed if the APQI is valid; otherwise, + * returns one of the following errors: + * -EINVAL if the APQI is not a number + * -ENODEV if the APQI exceeds the maximum value configured for the system + */ +static ssize_t unassign_domain_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long apqi; + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + + /* If the guest is running, disallow un-assignment of domain */ + if (matrix_mdev->kvm) + return -EBUSY; + + ret = kstrtoul(buf, 0, &apqi); + if (ret) + return ret; + + if (apqi > matrix_mdev->matrix.aqm_max) + return -ENODEV; + + mutex_lock(&matrix_dev->lock); + clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm); + mutex_unlock(&matrix_dev->lock); + + return count; +} +DEVICE_ATTR_WO(unassign_domain); + +/** + * assign_control_domain_store + * + * @dev: the matrix device + * @attr: the mediated matrix device's assign_control_domain attribute + * @buf: a buffer containing the domain ID to be assigned + * @count: the number of bytes in @buf + * + * Parses the domain ID from @buf and sets the corresponding bit in the mediated + * matrix device's ADM. + * + * Returns the number of bytes processed if the domain ID is valid; otherwise, + * returns one of the following errors: + * -EINVAL if the ID is not a number + * -ENODEV if the ID exceeds the maximum value configured for the system + */ +static ssize_t assign_control_domain_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long id; + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + + /* If the guest is running, disallow assignment of control domain */ + if (matrix_mdev->kvm) + return -EBUSY; + + ret = kstrtoul(buf, 0, &id); + if (ret) + return ret; + + if (id > matrix_mdev->matrix.adm_max) + return -ENODEV; + + /* Set the bit in the ADM (bitmask) corresponding to the AP control + * domain number (id). The bits in the mask, from most significant to + * least significant, correspond to IDs 0 up to the one less than the + * number of control domains that can be assigned. + */ + mutex_lock(&matrix_dev->lock); + set_bit_inv(id, matrix_mdev->matrix.adm); + mutex_unlock(&matrix_dev->lock); + + return count; +} +DEVICE_ATTR_WO(assign_control_domain); + +/** + * unassign_control_domain_store + * + * @dev: the matrix device + * @attr: the mediated matrix device's unassign_control_domain attribute + * @buf: a buffer containing the domain ID to be unassigned + * @count: the number of bytes in @buf + * + * Parses the domain ID from @buf and clears the corresponding bit in the + * mediated matrix device's ADM. + * + * Returns the number of bytes processed if the domain ID is valid; otherwise, + * returns one of the following errors: + * -EINVAL if the ID is not a number + * -ENODEV if the ID exceeds the maximum value configured for the system + */ +static ssize_t unassign_control_domain_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long domid; + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + unsigned long max_domid = matrix_mdev->matrix.adm_max; + + /* If the guest is running, disallow un-assignment of control domain */ + if (matrix_mdev->kvm) + return -EBUSY; + + ret = kstrtoul(buf, 0, &domid); + if (ret) + return ret; + if (domid > max_domid) + return -ENODEV; + + mutex_lock(&matrix_dev->lock); + clear_bit_inv(domid, matrix_mdev->matrix.adm); + mutex_unlock(&matrix_dev->lock); + + return count; +} +DEVICE_ATTR_WO(unassign_control_domain); + +static ssize_t control_domains_show(struct device *dev, + struct device_attribute *dev_attr, + char *buf) +{ + unsigned long id; + int nchars = 0; + int n; + char *bufpos = buf; + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + unsigned long max_domid = matrix_mdev->matrix.adm_max; + + mutex_lock(&matrix_dev->lock); + for_each_set_bit_inv(id, matrix_mdev->matrix.adm, max_domid + 1) { + n = sprintf(bufpos, "%04lx\n", id); + bufpos += n; + nchars += n; + } + mutex_unlock(&matrix_dev->lock); + + return nchars; +} +DEVICE_ATTR_RO(control_domains); + +static ssize_t matrix_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mdev_device *mdev = mdev_from_dev(dev); + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + char *bufpos = buf; + unsigned long apid; + unsigned long apqi; + unsigned long apid1; + unsigned long apqi1; + unsigned long napm_bits = matrix_mdev->matrix.apm_max + 1; + unsigned long naqm_bits = matrix_mdev->matrix.aqm_max + 1; + int nchars = 0; + int n; + + apid1 = find_first_bit_inv(matrix_mdev->matrix.apm, napm_bits); + apqi1 = find_first_bit_inv(matrix_mdev->matrix.aqm, naqm_bits); + + mutex_lock(&matrix_dev->lock); + + if ((apid1 < napm_bits) && (apqi1 < naqm_bits)) { + for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, napm_bits) { + for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, + naqm_bits) { + n = sprintf(bufpos, "%02lx.%04lx\n", apid, + apqi); + bufpos += n; + nchars += n; + } + } + } else if (apid1 < napm_bits) { + for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, napm_bits) { + n = sprintf(bufpos, "%02lx.\n", apid); + bufpos += n; + nchars += n; + } + } else if (apqi1 < naqm_bits) { + for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, naqm_bits) { + n = sprintf(bufpos, ".%04lx\n", apqi); + bufpos += n; + nchars += n; + } + } + + mutex_unlock(&matrix_dev->lock); + + return nchars; +} +DEVICE_ATTR_RO(matrix); + +static struct attribute *vfio_ap_mdev_attrs[] = { + &dev_attr_assign_adapter.attr, + &dev_attr_unassign_adapter.attr, + &dev_attr_assign_domain.attr, + &dev_attr_unassign_domain.attr, + &dev_attr_assign_control_domain.attr, + &dev_attr_unassign_control_domain.attr, + &dev_attr_control_domains.attr, + &dev_attr_matrix.attr, + NULL, +}; + +static struct attribute_group vfio_ap_mdev_attr_group = { + .attrs = vfio_ap_mdev_attrs +}; + +static const struct attribute_group *vfio_ap_mdev_attr_groups[] = { + &vfio_ap_mdev_attr_group, + NULL +}; + +static void vfio_ap_mdev_copy_masks(struct ap_matrix_mdev *matrix_mdev) +{ + int nbytes; + unsigned long *apm, *aqm, *adm; + struct kvm_s390_crypto_cb *crycb = matrix_mdev->kvm->arch.crypto.crycb; + + switch (matrix_mdev->kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { + case CRYCB_FORMAT2: + apm = (unsigned long *)crycb->apcb1.apm; + aqm = (unsigned long *)crycb->apcb1.aqm; + adm = (unsigned long *)crycb->apcb1.adm; + break; + case CRYCB_FORMAT1: + case CRYCB_FORMAT0: + apm = (unsigned long *)crycb->apcb0.apm; + aqm = (unsigned long *)crycb->apcb0.aqm; + adm = (unsigned long *)crycb->apcb0.adm; + break; + default: + /* cannot happen */ + return; + } + + nbytes = DIV_ROUND_UP(matrix_mdev->matrix.apm_max + 1, BITS_PER_BYTE); + memcpy(apm, matrix_mdev->matrix.apm, nbytes); + nbytes = DIV_ROUND_UP(matrix_mdev->matrix.aqm_max + 1, BITS_PER_BYTE); + memcpy(aqm, matrix_mdev->matrix.aqm, nbytes); + nbytes = DIV_ROUND_UP(matrix_mdev->matrix.adm_max + 1, BITS_PER_BYTE); + memcpy(adm, matrix_mdev->matrix.adm, nbytes); +} + +/** + * vfio_ap_mdev_set_kvm + * + * @matrix_mdev: a mediated matrix device + * @kvm: reference to KVM instance + * + * Verifies no other mediated matrix device has @kvm and sets a reference to + * it in @matrix_mdev->kvm. + * + * Return 0 if no other mediated matrix device has a reference to @kvm; + * otherwise, returns an -EPERM. + */ +static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev, + struct kvm *kvm) +{ + struct ap_matrix_mdev *m; + + mutex_lock(&matrix_dev->lock); + + list_for_each_entry(m, &matrix_dev->mdev_list, node) { + if ((m != matrix_mdev) && (m->kvm == kvm)) { + mutex_unlock(&matrix_dev->lock); + return -EPERM; + } + } + + matrix_mdev->kvm = kvm; + mutex_unlock(&matrix_dev->lock); + + return 0; +} + +static int vfio_ap_mdev_group_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int ret; + struct ap_matrix_mdev *matrix_mdev; + + if (action != VFIO_GROUP_NOTIFY_SET_KVM) + return NOTIFY_OK; + + matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier); + + if (!data) { + matrix_mdev->kvm = NULL; + return NOTIFY_OK; + } + + ret = vfio_ap_mdev_set_kvm(matrix_mdev, data); + if (ret) + return NOTIFY_DONE; + + /* If there is no CRYCB pointer, then we can't copy the masks */ + if (!matrix_mdev->kvm->arch.crypto.crycbd) + return NOTIFY_DONE; + + vfio_ap_mdev_copy_masks(matrix_mdev); + + return NOTIFY_OK; +} + +static int vfio_ap_mdev_reset_queue(unsigned int apid, unsigned int apqi, + unsigned int retry) +{ + struct ap_queue_status status; + + do { + status = ap_zapq(AP_MKQID(apid, apqi)); + switch (status.response_code) { + case AP_RESPONSE_NORMAL: + return 0; + case AP_RESPONSE_RESET_IN_PROGRESS: + case AP_RESPONSE_BUSY: + msleep(20); + break; + default: + /* things are really broken, give up */ + return -EIO; + } + } while (retry--); + + return -EBUSY; +} + +static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev) +{ + int ret; + int rc = 0; + unsigned long apid, apqi; + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + + for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, + matrix_mdev->matrix.apm_max + 1) { + for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, + matrix_mdev->matrix.aqm_max + 1) { + ret = vfio_ap_mdev_reset_queue(apid, apqi, 1); + /* + * Regardless whether a queue turns out to be busy, or + * is not operational, we need to continue resetting + * the remaining queues. + */ + if (ret) + rc = ret; + } + } + + return rc; +} + +static int vfio_ap_mdev_open(struct mdev_device *mdev) +{ + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + unsigned long events; + int ret; + + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier; + events = VFIO_GROUP_NOTIFY_SET_KVM; + + ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, + &events, &matrix_mdev->group_notifier); + if (ret) { + module_put(THIS_MODULE); + return ret; + } + + return 0; +} + +static void vfio_ap_mdev_release(struct mdev_device *mdev) +{ + struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); + + if (matrix_mdev->kvm) + kvm_arch_crypto_clear_masks(matrix_mdev->kvm); + + vfio_ap_mdev_reset_queues(mdev); + vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, + &matrix_mdev->group_notifier); + matrix_mdev->kvm = NULL; + module_put(THIS_MODULE); +} + +static int vfio_ap_mdev_get_device_info(unsigned long arg) +{ + unsigned long minsz; + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_AP | VFIO_DEVICE_FLAGS_RESET; + info.num_regions = 0; + info.num_irqs = 0; + + return copy_to_user((void __user *)arg, &info, minsz); +} + +static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev, + unsigned int cmd, unsigned long arg) +{ + int ret; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + ret = vfio_ap_mdev_get_device_info(arg); + break; + case VFIO_DEVICE_RESET: + ret = vfio_ap_mdev_reset_queues(mdev); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + return ret; +} + +static const struct mdev_parent_ops vfio_ap_matrix_ops = { + .owner = THIS_MODULE, + .supported_type_groups = vfio_ap_mdev_type_groups, + .mdev_attr_groups = vfio_ap_mdev_attr_groups, + .create = vfio_ap_mdev_create, + .remove = vfio_ap_mdev_remove, + .open = vfio_ap_mdev_open, + .release = vfio_ap_mdev_release, + .ioctl = vfio_ap_mdev_ioctl, +}; + +int vfio_ap_mdev_register(void) +{ + atomic_set(&matrix_dev->available_instances, MAX_ZDEV_ENTRIES_EXT); + + return mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_ops); +} + +void vfio_ap_mdev_unregister(void) +{ + mdev_unregister_device(&matrix_dev->device); +} diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h new file mode 100644 index 000000000000..5675492233c7 --- /dev/null +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Private data and functions for adjunct processor VFIO matrix driver. + * + * Author(s): Tony Krowiak <akrowiak@linux.ibm.com> + * Halil Pasic <pasic@linux.ibm.com> + * + * Copyright IBM Corp. 2018 + */ + +#ifndef _VFIO_AP_PRIVATE_H_ +#define _VFIO_AP_PRIVATE_H_ + +#include <linux/types.h> +#include <linux/device.h> +#include <linux/mdev.h> +#include <linux/delay.h> +#include <linux/mutex.h> + +#include "ap_bus.h" + +#define VFIO_AP_MODULE_NAME "vfio_ap" +#define VFIO_AP_DRV_NAME "vfio_ap" + +/** + * ap_matrix_dev - the AP matrix device structure + * @device: generic device structure associated with the AP matrix device + * @available_instances: number of mediated matrix devices that can be created + * @info: the struct containing the output from the PQAP(QCI) instruction + * mdev_list: the list of mediated matrix devices created + * lock: mutex for locking the AP matrix device. This lock will be + * taken every time we fiddle with state managed by the vfio_ap + * driver, be it using @mdev_list or writing the state of a + * single ap_matrix_mdev device. It's quite coarse but we don't + * expect much contention. + */ +struct ap_matrix_dev { + struct device device; + atomic_t available_instances; + struct ap_config_info info; + struct list_head mdev_list; + struct mutex lock; +}; + +extern struct ap_matrix_dev *matrix_dev; + +/** + * The AP matrix is comprised of three bit masks identifying the adapters, + * queues (domains) and control domains that belong to an AP matrix. The bits i + * each mask, from least significant to most significant bit, correspond to IDs + * 0 to 255. When a bit is set, the corresponding ID belongs to the matrix. + * + * @apm_max: max adapter number in @apm + * @apm identifies the AP adapters in the matrix + * @aqm_max: max domain number in @aqm + * @aqm identifies the AP queues (domains) in the matrix + * @adm_max: max domain number in @adm + * @adm identifies the AP control domains in the matrix + */ +struct ap_matrix { + unsigned long apm_max; + DECLARE_BITMAP(apm, 256); + unsigned long aqm_max; + DECLARE_BITMAP(aqm, 256); + unsigned long adm_max; + DECLARE_BITMAP(adm, 256); +}; + +/** + * struct ap_matrix_mdev - the mediated matrix device structure + * @list: allows the ap_matrix_mdev struct to be added to a list + * @matrix: the adapters, usage domains and control domains assigned to the + * mediated matrix device. + * @group_notifier: notifier block used for specifying callback function for + * handling the VFIO_GROUP_NOTIFY_SET_KVM event + * @kvm: the struct holding guest's state + */ +struct ap_matrix_mdev { + struct list_head node; + struct ap_matrix matrix; + struct notifier_block group_notifier; + struct kvm *kvm; +}; + +extern int vfio_ap_mdev_register(void); +extern void vfio_ap_mdev_unregister(void); + +#endif /* _VFIO_AP_PRIVATE_H_ */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 1aa7b82e8169..f378b9802d8b 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -200,6 +200,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ +#define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; @@ -215,6 +216,7 @@ struct vfio_device_info { #define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform" #define VFIO_DEVICE_API_AMBA_STRING "vfio-amba" #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" +#define VFIO_DEVICE_API_AP_STRING "vfio-ap" /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, |