diff options
115 files changed, 4223 insertions, 561 deletions
diff --git a/Documentation/s390/00-INDEX b/Documentation/s390/00-INDEX index 9189535f6cd2..317f0378ae01 100644 --- a/Documentation/s390/00-INDEX +++ b/Documentation/s390/00-INDEX @@ -22,5 +22,7 @@ qeth.txt - HiperSockets Bridge Port Support. s390dbf.txt - information on using the s390 debug feature. +vfio-ccw.txt + information on the vfio-ccw I/O subchannel driver. zfcpdump.txt - information on the s390 SCSI dump tool. diff --git a/Documentation/s390/vfio-ccw.txt b/Documentation/s390/vfio-ccw.txt new file mode 100644 index 000000000000..90b3dfead81b --- /dev/null +++ b/Documentation/s390/vfio-ccw.txt @@ -0,0 +1,303 @@ +vfio-ccw: the basic infrastructure +================================== + +Introduction +------------ + +Here we describe the vfio support for I/O subchannel devices for +Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a +virtual machine, while vfio is the means. + +Different than other hardware architectures, s390 has defined a unified +I/O access method, which is so called Channel I/O. It has its own access +patterns: +- Channel programs run asynchronously on a separate (co)processor. +- The channel subsystem will access any memory designated by the caller + in the channel program directly, i.e. there is no iommu involved. +Thus when we introduce vfio support for these devices, we realize it +with a mediated device (mdev) implementation. The vfio mdev will be +added to an iommu group, so as to make itself able to be managed by the +vfio framework. And we add read/write callbacks for special vfio I/O +regions to pass the channel programs from the mdev to its parent device +(the real I/O subchannel device) to do further address translation and +to perform I/O instructions. + +This document does not intend to explain the s390 I/O architecture in +every detail. More information/reference could be found here: +- A good start to know Channel I/O in general: + https://en.wikipedia.org/wiki/Channel_I/O +- s390 architecture: + s390 Principles of Operation manual (IBM Form. No. SA22-7832) +- The existing Qemu code which implements a simple emulated channel + subsystem could also be a good reference. It makes it easier to follow + the flow. + qemu/hw/s390x/css.c + +For vfio mediated device framework: +- Documentation/vfio-mediated-device.txt + +Motivation of vfio-ccw +---------------------- + +Currently, a guest virtualized via qemu/kvm on s390 only sees +paravirtualized virtio devices via the "Virtio Over Channel I/O +(virtio-ccw)" transport. This makes virtio devices discoverable via +standard operating system algorithms for handling channel devices. + +However this is not enough. On s390 for the majority of devices, which +use the standard Channel I/O based mechanism, we also need to provide +the functionality of passing through them to a Qemu virtual machine. +This includes devices that don't have a virtio counterpart (e.g. tape +drives) or that have specific characteristics which guests want to +exploit. + +For passing a device to a guest, we want to use the same interface as +everybody else, namely vfio. Thus, we would like to introduce vfio +support for channel devices. And we would like to name this new vfio +device "vfio-ccw". + +Access patterns of CCW devices +------------------------------ + +s390 architecture has implemented a so called channel subsystem, that +provides a unified view of the devices physically attached to the +systems. Though the s390 hardware platform knows about a huge variety of +different peripheral attachments like disk devices (aka. DASDs), tapes, +communication controllers, etc. They can all be accessed by a well +defined access method and they are presenting I/O completion a unified +way: I/O interruptions. + +All I/O requires the use of channel command words (CCWs). A CCW is an +instruction to a specialized I/O channel processor. A channel program is +a sequence of CCWs which are executed by the I/O channel subsystem. To +issue a channel program to the channel subsystem, it is required to +build an operation request block (ORB), which can be used to point out +the format of the CCW and other control information to the system. The +operating system signals the I/O channel subsystem to begin executing +the channel program with a SSCH (start sub-channel) instruction. The +central processor is then free to proceed with non-I/O instructions +until interrupted. The I/O completion result is received by the +interrupt handler in the form of interrupt response block (IRB). + +Back to vfio-ccw, in short: +- ORBs and channel programs are built in guest kernel (with guest + physical addresses). +- ORBs and channel programs are passed to the host kernel. +- Host kernel translates the guest physical addresses to real addresses + and starts the I/O with issuing a privileged Channel I/O instruction + (e.g SSCH). +- channel programs run asynchronously on a separate processor. +- I/O completion will be signaled to the host with I/O interruptions. + And it will be copied as IRB to user space to pass it back to the + guest. + +Physical vfio ccw device and its child mdev +------------------------------------------- + +As mentioned above, we realize vfio-ccw with a mdev implementation. + +Channel I/O does not have IOMMU hardware support, so the physical +vfio-ccw device does not have an IOMMU level translation or isolation. + +Sub-channel I/O instructions are all privileged instructions, When +handling the I/O instruction interception, vfio-ccw has the software +policing and translation how the channel program is programmed before +it gets sent to hardware. + +Within this implementation, we have two drivers for two types of +devices: +- The vfio_ccw driver for the physical subchannel device. + This is an I/O subchannel driver for the real subchannel device. It + realizes a group of callbacks and registers to the mdev framework as a + parent (physical) device. As a consequence, mdev provides vfio_ccw a + generic interface (sysfs) to create mdev devices. A vfio mdev could be + created by vfio_ccw then and added to the mediated bus. It is the vfio + device that added to an IOMMU group and a vfio group. + vfio_ccw also provides an I/O region to accept channel program + request from user space and store I/O interrupt result for user + space to retrieve. To notify user space an I/O completion, it offers + an interface to setup an eventfd fd for asynchronous signaling. + +- The vfio_mdev driver for the mediated vfio ccw device. + This is provided by the mdev framework. It is a vfio device driver for + the mdev that created by vfio_ccw. + It realize a group of vfio device driver callbacks, adds itself to a + vfio group, and registers itself to the mdev framework as a mdev + driver. + It uses a vfio iommu backend that uses the existing map and unmap + ioctls, but rather than programming them into an IOMMU for a device, + it simply stores the translations for use by later requests. This + means that a device programmed in a VM with guest physical addresses + can have the vfio kernel convert that address to process virtual + address, pin the page and program the hardware with the host physical + address in one step. + For a mdev, the vfio iommu backend will not pin the pages during the + VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database + of the iova<->vaddr mappings in this operation. And they export a + vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu + backend for the physical devices to pin and unpin pages by demand. + +Below is a high Level block diagram. + + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_device() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ccw.ko |<-> subchannel + | |interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ + +The process of how these work together. +1. vfio_ccw.ko drives the physical I/O subchannel, and registers the + physical device (with callbacks) to mdev framework. + When vfio_ccw probing the subchannel device, it registers device + pointer and callbacks to the mdev framework. Mdev related file nodes + under the device node in sysfs would be created for the subchannel + device, namely 'mdev_create', 'mdev_destroy' and + 'mdev_supported_types'. +2. Create a mediated vfio ccw device. + Use the 'mdev_create' sysfs file, we need to manually create one (and + only one for our case) mediated device. +3. vfio_mdev.ko drives the mediated ccw device. + vfio_mdev is also the vfio device drvier. It will probe the mdev and + add it to an iommu_group and a vfio_group. Then we could pass through + the mdev to a guest. + +vfio-ccw I/O region +------------------- + +An I/O region is used to accept channel program request from user +space and store I/O interrupt result for user space to retrieve. The +defination of the region is: + +struct ccw_io_region { +#define ORB_AREA_SIZE 12 + __u8 orb_area[ORB_AREA_SIZE]; +#define SCSW_AREA_SIZE 12 + __u8 scsw_area[SCSW_AREA_SIZE]; +#define IRB_AREA_SIZE 96 + __u8 irb_area[IRB_AREA_SIZE]; + __u32 ret_code; +} __packed; + +While starting an I/O request, orb_area should be filled with the +guest ORB, and scsw_area should be filled with the SCSW of the Virtual +Subchannel. + +irb_area stores the I/O result. + +ret_code stores a return code for each access of the region. + +vfio-ccw patches overview +------------------------- + +For now, our patches are rebased on the latest mdev implementation. +vfio-ccw follows what vfio-pci did on the s390 paltform and uses +vfio-iommu-type1 as the vfio iommu backend. It's a good start to launch +the code review for vfio-ccw. Note that the implementation is far from +complete yet; but we'd like to get feedback for the general +architecture. + +* CCW translation APIs +- Description: + These introduce a group of APIs (start with 'cp_') to do CCW + translation. The CCWs passed in by a user space program are + organized with their guest physical memory addresses. These APIs + will copy the CCWs into the kernel space, and assemble a runnable + kernel channel program by updating the guest physical addresses with + their corresponding host physical addresses. +- Patches: + vfio: ccw: introduce channel program interfaces + +* vfio_ccw device driver +- Description: + The following patches utilizes the CCW translation APIs and introduce + vfio_ccw, which is the driver for the I/O subchannel devices you want + to pass through. + vfio_ccw implements the following vfio ioctls: + VFIO_DEVICE_GET_INFO + VFIO_DEVICE_GET_IRQ_INFO + VFIO_DEVICE_GET_REGION_INFO + VFIO_DEVICE_RESET + VFIO_DEVICE_SET_IRQS + This provides an I/O region, so that the user space program can pass a + channel program to the kernel, to do further CCW translation before + issuing them to a real device. + This also provides the SET_IRQ ioctl to setup an event notifier to + notify the user space program the I/O completion in an asynchronous + way. +- Patches: + vfio: ccw: basic implementation for vfio_ccw driver + vfio: ccw: introduce ccw_io_region + vfio: ccw: realize VFIO_DEVICE_GET_REGION_INFO ioctl + vfio: ccw: realize VFIO_DEVICE_RESET ioctl + vfio: ccw: realize VFIO_DEVICE_G(S)ET_IRQ_INFO ioctls + +The user of vfio-ccw is not limited to Qemu, while Qemu is definitely a +good example to get understand how these patches work. Here is a little +bit more detail how an I/O request triggered by the Qemu guest will be +handled (without error handling). + +Explanation: +Q1-Q7: Qemu side process. +K1-K5: Kernel side process. + +Q1. Get I/O region info during initialization. +Q2. Setup event notifier and handler to handle I/O completion. + +... ... + +Q3. Intercept a ssch instruction. +Q4. Write the guest channel program and ORB to the I/O region. + K1. Copy from guest to kernel. + K2. Translate the guest channel program to a host kernel space + channel program, which becomes runnable for a real device. + K3. With the necessary information contained in the orb passed in + by Qemu, issue the ccwchain to the device. + K4. Return the ssch CC code. +Q5. Return the CC code to the guest. + +... ... + + K5. Interrupt handler gets the I/O result and write the result to + the I/O region. + K6. Signal Qemu to retrieve the result. +Q6. Get the signal and event handler reads out the result from the I/O + region. +Q7. Update the irb for the guest. + +Limitations +----------- + +The current vfio-ccw implementation focuses on supporting basic commands +needed to implement block device functionality (read/write) of DASD/ECKD +device only. Some commands may need special handling in the future, for +example, anything related to path grouping. + +DASD is a kind of storage device. While ECKD is a data recording format. +More information for DASD and ECKD could be found here: +https://en.wikipedia.org/wiki/Direct-access_storage_device +https://en.wikipedia.org/wiki/Count_key_data + +Together with the corresponding work in Qemu, we can bring the passed +through DASD/ECKD device online in a guest now and use it as a block +device. + +Reference +--------- +1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832) +2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204) +3. https://en.wikipedia.org/wiki/Channel_I/O +4. Documentation/s390/cds.txt +5. Documentation/vfio.txt +6. Documentation/vfio-mediated-device.txt diff --git a/MAINTAINERS b/MAINTAINERS index 33ecf266570f..5f91365ebc0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7201,6 +7201,7 @@ S: Supported F: Documentation/s390/kvm.txt F: arch/s390/include/asm/kvm* F: arch/s390/kvm/ +F: arch/s390/mm/gmap.c KERNEL VIRTUAL MACHINE (KVM) FOR ARM M: Christoffer Dall <christoffer.dall@linaro.org> @@ -10896,6 +10897,16 @@ W: http://www.ibm.com/developerworks/linux/linux390/ S: Supported F: drivers/iommu/s390-iommu.c +S390 VFIO-CCW DRIVER +M: Cornelia Huck <cornelia.huck@de.ibm.com> +M: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> +L: linux-s390@vger.kernel.org +L: kvm@vger.kernel.org +S: Supported +F: drivers/s390/cio/vfio_ccw* +F: Documentation/s390/vfio-ccw.txt +F: include/uapi/linux/vfio_ccw.h + S3C24XX SD/MMC Driver M: Ben Dooks <ben-linux@fluff.org> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild index e256592eb66e..eae2c64cf69d 100644 --- a/arch/s390/Kbuild +++ b/arch/s390/Kbuild @@ -1,7 +1,7 @@ obj-y += kernel/ obj-y += mm/ obj-$(CONFIG_KVM) += kvm/ -obj-$(CONFIG_CRYPTO_HW) += crypto/ +obj-y += crypto/ obj-$(CONFIG_S390_HYPFS_FS) += hypfs/ obj-$(CONFIG_APPLDATA_BASE) += appldata/ obj-y += net/ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b8b143432381..e161fafb495b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -105,6 +105,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF @@ -123,7 +124,6 @@ config S390 select GENERIC_TIME_VSYSCALL select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL - select HAVE_ARCH_EARLY_PFN_TO_NID select HAVE_ARCH_JUMP_LABEL select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES select HAVE_ARCH_SECCOMP_FILTER @@ -506,6 +506,21 @@ source kernel/Kconfig.preempt source kernel/Kconfig.hz +config ARCH_RANDOM + def_bool y + prompt "s390 architectural random number generation API" + help + Enable the s390 architectural random number generation API + to provide random data for all consumers within the Linux + kernel. + + When enabled the arch_random_* functions declared in linux/random.h + are implemented. The implementation is based on the s390 CPACF + instruction subfunction TRNG which provides a real true random + number generator. + + If unsure, say Y. + endmenu menu "Memory setup" @@ -536,6 +551,16 @@ config FORCE_MAX_ZONEORDER source "mm/Kconfig" +config MAX_PHYSMEM_BITS + int "Maximum size of supported physical memory in bits (42-53)" + range 42 53 + default "46" + help + This option specifies the maximum supported size of physical memory + in bits. Supported is any size between 2^42 (4TB) and 2^53 (8PB). + Increasing the number of bits also increases the kernel image size. + By default 46 bits (64TB) are supported. + config PACK_STACK def_bool y prompt "Pack kernel stack" @@ -613,7 +638,7 @@ if PCI config PCI_NR_FUNCTIONS int "Maximum number of PCI functions (1-4096)" range 1 4096 - default "64" + default "128" help This allows you to specify the maximum number of PCI functions which this kernel will support. @@ -671,6 +696,16 @@ config EADM_SCH To compile this driver as a module, choose M here: the module will be called eadm_sch. +config VFIO_CCW + def_tristate n + prompt "Support for VFIO-CCW subchannels" + depends on S390_CCW_IOMMU && VFIO_MDEV + help + This driver allows usage of I/O subchannels via VFIO-CCW. + + To compile this driver as a module, choose M here: the + module will be called vfio_ccw. + endmenu menu "Dump support" diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 4b176fe83da4..a5039fa89314 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -73,6 +73,7 @@ CONFIG_ZSWAP=y CONFIG_ZBUD=m CONFIG_ZSMALLOC=m CONFIG_ZSMALLOC_STAT=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PCI=y CONFIG_PCI_DEBUG=y diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 0de46cc397f6..83970b5afb2b 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -72,6 +72,7 @@ CONFIG_ZSWAP=y CONFIG_ZBUD=m CONFIG_ZSMALLOC=m CONFIG_ZSMALLOC_STAT=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PCI=y CONFIG_HOTPLUG_PCI=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index e167557b434c..fbc6542aaf59 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -70,6 +70,7 @@ CONFIG_ZSWAP=y CONFIG_ZBUD=m CONFIG_ZSMALLOC=m CONFIG_ZSMALLOC_STAT=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PCI=y CONFIG_HOTPLUG_PCI=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 4366a3e3e754..e23d97c13735 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -35,7 +35,6 @@ CONFIG_SCSI_ENCLOSURE=y CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_FC_ATTRS=y -CONFIG_SCSI_SRP_ATTRS=y CONFIG_ZFCP=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set # CONFIG_INPUT_KEYBOARD is not set diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 402c530c6da5..678d9863e3f0 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -10,5 +10,6 @@ obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o paes_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o +obj-$(CONFIG_ARCH_RANDOM) += arch_random.o crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c new file mode 100644 index 000000000000..9317b3e645e2 --- /dev/null +++ b/arch/s390/crypto/arch_random.c @@ -0,0 +1,31 @@ +/* + * s390 arch random implementation. + * + * Copyright IBM Corp. 2017 + * Author(s): Harald Freudenberger <freude@de.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/static_key.h> +#include <asm/cpacf.h> + +DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); + +atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0); +EXPORT_SYMBOL(s390_arch_random_counter); + +static int __init s390_arch_random_init(void) +{ + /* check if subfunction CPACF_PRNO_TRNG is available */ + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + static_branch_enable(&s390_arch_random_available); + + return 0; +} +arch_initcall(s390_arch_random_init); diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index 716b17238599..a4e903ed7e21 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -616,7 +616,7 @@ out_err: module_init(paes_s390_init); module_exit(paes_s390_fini); -MODULE_ALIAS_CRYPTO("aes-all"); +MODULE_ALIAS_CRYPTO("paes"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm with protected keys"); MODULE_LICENSE("GPL"); diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 5a3ec04a7082..3e47c4a0f18b 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -81,7 +81,7 @@ struct prng_ws_s { u64 byte_counter; }; -struct ppno_ws_s { +struct prno_ws_s { u32 res; u32 reseed_counter; u64 stream_bytes; @@ -93,7 +93,7 @@ struct prng_data_s { struct mutex mutex; union { struct prng_ws_s prngws; - struct ppno_ws_s ppnows; + struct prno_ws_s prnows; }; u8 *buf; u32 rest; @@ -306,12 +306,12 @@ static int __init prng_sha512_selftest(void) 0x36, 0x8c, 0x5a, 0x9f, 0x7a, 0x4b, 0x3e, 0xe2 }; u8 buf[sizeof(random)]; - struct ppno_ws_s ws; + struct prno_ws_s ws; memset(&ws, 0, sizeof(ws)); /* initial seed */ - cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED, + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, &ws, NULL, 0, seed, sizeof(seed)); /* check working states V and C */ @@ -324,9 +324,9 @@ static int __init prng_sha512_selftest(void) } /* generate random bytes */ - cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, &ws, buf, sizeof(buf), NULL, 0); - cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, &ws, buf, sizeof(buf), NULL, 0); /* check against expected data */ @@ -374,16 +374,16 @@ static int __init prng_sha512_instantiate(void) /* followed by 16 bytes of unique nonce */ get_tod_clock_ext(seed + 64 + 32); - /* initial seed of the ppno drng */ - cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED, - &prng_data->ppnows, NULL, 0, seed, sizeof(seed)); + /* initial seed of the prno drng */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, + &prng_data->prnows, NULL, 0, seed, sizeof(seed)); /* if fips mode is enabled, generate a first block of random bytes for the FIPS 140-2 Conditional Self Test */ if (fips_enabled) { prng_data->prev = prng_data->buf + prng_chunk_size; - cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, - &prng_data->ppnows, + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, + &prng_data->prnows, prng_data->prev, prng_chunk_size, NULL, 0); } @@ -412,9 +412,9 @@ static int prng_sha512_reseed(void) if (ret != sizeof(seed)) return ret; - /* do a reseed of the ppno drng with this bytestring */ - cpacf_ppno(CPACF_PPNO_SHA512_DRNG_SEED, - &prng_data->ppnows, NULL, 0, seed, sizeof(seed)); + /* do a reseed of the prno drng with this bytestring */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, + &prng_data->prnows, NULL, 0, seed, sizeof(seed)); return 0; } @@ -425,15 +425,15 @@ static int prng_sha512_generate(u8 *buf, size_t nbytes) int ret; /* reseed needed ? */ - if (prng_data->ppnows.reseed_counter > prng_reseed_limit) { + if (prng_data->prnows.reseed_counter > prng_reseed_limit) { ret = prng_sha512_reseed(); if (ret) return ret; } - /* PPNO generate */ - cpacf_ppno(CPACF_PPNO_SHA512_DRNG_GEN, - &prng_data->ppnows, buf, nbytes, NULL, 0); + /* PRNO generate */ + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, + &prng_data->prnows, buf, nbytes, NULL, 0); /* FIPS 140-2 Conditional Self Test */ if (fips_enabled) { @@ -653,7 +653,7 @@ static ssize_t prng_counter_show(struct device *dev, if (mutex_lock_interruptible(&prng_data->mutex)) return -ERESTARTSYS; if (prng_mode == PRNG_MODE_SHA512) - counter = prng_data->ppnows.stream_bytes; + counter = prng_data->prnows.stream_bytes; else counter = prng_data->prngws.byte_counter; mutex_unlock(&prng_data->mutex); @@ -774,8 +774,8 @@ static int __init prng_init(void) /* choose prng mode */ if (prng_mode != PRNG_MODE_TDES) { - /* check for MSA5 support for PPNO operations */ - if (!cpacf_query_func(CPACF_PPNO, CPACF_PPNO_SHA512_DRNG_GEN)) { + /* check for MSA5 support for PRNO operations */ + if (!cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) { if (prng_mode == PRNG_MODE_SHA512) { pr_err("The prng module cannot " "start in SHA-512 mode\n"); diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 8aea32fe8bd2..7e3481eb2174 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -1,8 +1,14 @@ generic-y += asm-offsets.h generic-y += clkdev.h generic-y += dma-contiguous.h +generic-y += div64.h +generic-y += emergency-restart.h generic-y += export.h +generic-y += irq_regs.h generic-y += irq_work.h +generic-y += kmap_types.h +generic-y += local.h +generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += preempt.h diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h new file mode 100644 index 000000000000..6033901a40b2 --- /dev/null +++ b/arch/s390/include/asm/archrandom.h @@ -0,0 +1,69 @@ +/* + * Kernel interface for the s390 arch_random_* functions + * + * Copyright IBM Corp. 2017 + * + * Author: Harald Freudenberger <freude@de.ibm.com> + * + */ + +#ifndef _ASM_S390_ARCHRANDOM_H +#define _ASM_S390_ARCHRANDOM_H + +#ifdef CONFIG_ARCH_RANDOM + +#include <linux/static_key.h> +#include <linux/atomic.h> +#include <asm/cpacf.h> + +DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); +extern atomic64_t s390_arch_random_counter; + +static void s390_arch_random_generate(u8 *buf, unsigned int nbytes) +{ + cpacf_trng(NULL, 0, buf, nbytes); + atomic64_add(nbytes, &s390_arch_random_counter); +} + +static inline bool arch_has_random(void) +{ + if (static_branch_likely(&s390_arch_random_available)) + return true; + return false; +} + +static inline bool arch_has_random_seed(void) +{ + return arch_has_random(); +} + +static inline bool arch_get_random_long(unsigned long *v) +{ + if (static_branch_likely(&s390_arch_random_available)) { + s390_arch_random_generate((u8 *)v, sizeof(*v)); + return true; + } + return false; +} + +static inline bool arch_get_random_int(unsigned int *v) +{ + if (static_branch_likely(&s390_arch_random_available)) { + s390_arch_random_generate((u8 *)v, sizeof(*v)); + return true; + } + return false; +} + +static inline bool arch_get_random_seed_long(unsigned long *v) +{ + return arch_get_random_long(v); +} + +static inline bool arch_get_random_seed_int(unsigned int *v) +{ + return arch_get_random_int(v); +} + +#endif /* CONFIG_ARCH_RANDOM */ +#endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index ac9e2b939d04..ba6d29412344 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -111,20 +111,22 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") static inline int __atomic_cmpxchg(int *ptr, int old, int new) { - asm volatile( - " cs %[old],%[new],%[ptr]" - : [old] "+d" (old), [ptr] "+Q" (*ptr) - : [new] "d" (new) : "cc", "memory"); - return old; + return __sync_val_compare_and_swap(ptr, old, new); +} + +static inline int __atomic_cmpxchg_bool(int *ptr, int old, int new) +{ + return __sync_bool_compare_and_swap(ptr, old, new); } static inline long __atomic64_cmpxchg(long *ptr, long old, long new) { - asm volatile( - " csg %[old],%[new],%[ptr]" - : [old] "+d" (old), [ptr] "+Q" (*ptr) - : [new] "d" (new) : "cc", "memory"); - return old; + return __sync_val_compare_and_swap(ptr, old, new); +} + +static inline long __atomic64_cmpxchg_bool(long *ptr, long old, long new) +{ + return __sync_bool_compare_and_swap(ptr, old, new); } #endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d92047da5ccb..99902b7b9f0c 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -15,14 +15,6 @@ * end up numbered: * |63..............0|127............64|191...........128|255...........192| * - * There are a few little-endian macros used mostly for filesystem - * bitmaps, these work on similar bit array layouts, but byte-oriented: - * |7...0|15...8|23...16|31...24|39...32|47...40|55...48|63...56| - * - * The main difference is that bit 3-5 in the bit number field needs to be - * reversed compared to the big-endian bit fields. This can be achieved by - * XOR with 0x38. - * * We also have special functions which work with an MSB0 encoding. * The bits are numbered: * |0..............63|64............127|128...........191|192...........255| @@ -253,6 +245,11 @@ unsigned long find_first_bit_inv(const unsigned long *addr, unsigned long size); unsigned long find_next_bit_inv(const unsigned long *addr, unsigned long size, unsigned long offset); +#define for_each_set_bit_inv(bit, addr, size) \ + for ((bit) = find_first_bit_inv((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit_inv((addr), (size), (bit) + 1)) + static inline void set_bit_inv(unsigned long nr, volatile unsigned long *ptr) { return set_bit(nr ^ (BITS_PER_LONG - 1), ptr); diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index f7ed88cc066e..7a38ca85190b 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -33,6 +33,24 @@ struct ccw1 { __u32 cda; } __attribute__ ((packed,aligned(8))); +/** + * struct ccw0 - channel command word + * @cmd_code: command code + * @cda: data address + * @flags: flags, like IDA addressing, etc. + * @reserved: will be ignored + * @count: byte count + * + * The format-0 ccw structure. + */ +struct ccw0 { + __u8 cmd_code; + __u32 cda : 24; + __u8 flags; + __u8 reserved; + __u16 count; +} __packed __aligned(8); + #define CCW_FLAG_DC 0x80 #define CCW_FLAG_CC 0x40 #define CCW_FLAG_SLI 0x20 diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index e2dfbf280d12..e06f2556b316 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -25,7 +25,8 @@ #define CPACF_KMO 0xb92b /* MSA4 */ #define CPACF_PCC 0xb92c /* MSA4 */ #define CPACF_KMCTR 0xb92d /* MSA4 */ -#define CPACF_PPNO 0xb93c /* MSA5 */ +#define CPACF_PRNO 0xb93c /* MSA5 */ +#define CPACF_KMA 0xb929 /* MSA8 */ /* * En/decryption modifier bits @@ -123,12 +124,14 @@ #define CPACF_PCKMO_ENC_AES_256_KEY 0x14 /* - * Function codes for the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION) + * Function codes for the PRNO (PERFORM RANDOM NUMBER OPERATION) * instruction */ -#define CPACF_PPNO_QUERY 0x00 -#define CPACF_PPNO_SHA512_DRNG_GEN 0x03 -#define CPACF_PPNO_SHA512_DRNG_SEED 0x83 +#define CPACF_PRNO_QUERY 0x00 +#define CPACF_PRNO_SHA512_DRNG_GEN 0x03 +#define CPACF_PRNO_SHA512_DRNG_SEED 0x83 +#define CPACF_PRNO_TRNG_Q_R2C_RATIO 0x70 +#define CPACF_PRNO_TRNG 0x72 typedef struct { unsigned char bytes[16]; } cpacf_mask_t; @@ -149,8 +152,8 @@ static inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) asm volatile( " spm 0\n" /* pckmo doesn't change the cc */ - /* Parameter registers are ignored, but may not be 0 */ - "0: .insn rrf,%[opc] << 16,2,2,2,0\n" + /* Parameter regs are ignored, but must be nonzero and unique */ + "0: .insn rrf,%[opc] << 16,2,4,6,0\n" " brc 1,0b\n" /* handle partial completion */ : "=m" (*mask) : [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode) @@ -173,7 +176,7 @@ static inline int __cpacf_check_opcode(unsigned int opcode) case CPACF_PCC: case CPACF_KMCTR: return test_facility(77); /* check for MSA4 */ - case CPACF_PPNO: + case CPACF_PRNO: return test_facility(57); /* check for MSA5 */ default: BUG(); @@ -373,18 +376,18 @@ static inline int cpacf_kmctr(unsigned long func, void *param, u8 *dest, } /** - * cpacf_ppno() - executes the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION) + * cpacf_prno() - executes the PRNO (PERFORM RANDOM NUMBER OPERATION) * instruction - * @func: the function code passed to PPNO; see CPACF_PPNO_xxx defines + * @func: the function code passed to PRNO; see CPACF_PRNO_xxx defines * @param: address of parameter block; see POP for details on each func * @dest: address of destination memory area * @dest_len: size of destination memory area in bytes * @seed: address of seed data * @seed_len: size of seed data in bytes */ -static inline void cpacf_ppno(unsigned long func, void *param, - u8 *dest, long dest_len, - const u8 *seed, long seed_len) +static inline void cpacf_prno(unsigned long func, void *param, + u8 *dest, unsigned long dest_len, + const u8 *seed, unsigned long seed_len) { register unsigned long r0 asm("0") = (unsigned long) func; register unsigned long r1 asm("1") = (unsigned long) param; @@ -398,7 +401,32 @@ static inline void cpacf_ppno(unsigned long func, void *param, " brc 1,0b\n" /* handle partial completion */ : [dst] "+a" (r2), [dlen] "+d" (r3) : [fc] "d" (r0), [pba] "a" (r1), - [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PPNO) + [seed] "a" (r4), [slen] "d" (r5), [opc] "i" (CPACF_PRNO) + : "cc", "memory"); +} + +/** + * cpacf_trng() - executes the TRNG subfunction of the PRNO instruction + * @ucbuf: buffer for unconditioned data + * @ucbuf_len: amount of unconditioned data to fetch in bytes + * @cbuf: buffer for conditioned data + * @cbuf_len: amount of conditioned data to fetch in bytes + */ +static inline void cpacf_trng(u8 *ucbuf, unsigned long ucbuf_len, + u8 *cbuf, unsigned long cbuf_len) +{ + register unsigned long r0 asm("0") = (unsigned long) CPACF_PRNO_TRNG; + register unsigned long r2 asm("2") = (unsigned long) ucbuf; + register unsigned long r3 asm("3") = (unsigned long) ucbuf_len; + register unsigned long r4 asm("4") = (unsigned long) cbuf; + register unsigned long r5 asm("5") = (unsigned long) cbuf_len; + + asm volatile ( + "0: .insn rre,%[opc] << 16,%[ucbuf],%[cbuf]\n" + " brc 1,0b\n" /* handle partial completion */ + : [ucbuf] "+a" (r2), [ucbuflen] "+d" (r3), + [cbuf] "+a" (r4), [cbuflen] "+d" (r5) + : [fc] "d" (r0), [opc] "i" (CPACF_PRNO) : "cc", "memory"); } diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index d1e0707310fd..05480e4cc5ca 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -20,9 +20,11 @@ #define CPU_MF_INT_SF_PRA (1 << 29) /* program request alert */ #define CPU_MF_INT_SF_SACA (1 << 23) /* sampler auth. change alert */ #define CPU_MF_INT_SF_LSDA (1 << 22) /* loss of sample data alert */ +#define CPU_MF_INT_CF_MTDA (1 << 15) /* loss of MT ctr. data alert */ #define CPU_MF_INT_CF_CACA (1 << 7) /* counter auth. change alert */ #define CPU_MF_INT_CF_LCDA (1 << 6) /* loss of counter data alert */ -#define CPU_MF_INT_CF_MASK (CPU_MF_INT_CF_CACA|CPU_MF_INT_CF_LCDA) +#define CPU_MF_INT_CF_MASK (CPU_MF_INT_CF_MTDA|CPU_MF_INT_CF_CACA| \ + CPU_MF_INT_CF_LCDA) #define CPU_MF_INT_SF_MASK (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE| \ CPU_MF_INT_SF_PRA|CPU_MF_INT_SF_SACA| \ CPU_MF_INT_SF_LSDA) @@ -172,7 +174,7 @@ static inline int lcctl(u64 ctl) /* Extract CPU counter */ static inline int __ecctr(u64 ctr, u64 *content) { - register u64 _content asm("4") = 0; + u64 _content; int cc; asm volatile ( diff --git a/arch/s390/include/asm/div64.h b/arch/s390/include/asm/div64.h deleted file mode 100644 index 6cd978cefb28..000000000000 --- a/arch/s390/include/asm/div64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/div64.h> diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 1d48880b3cc1..e8f623041769 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -105,6 +105,7 @@ #define HWCAP_S390_VXRS 2048 #define HWCAP_S390_VXRS_BCD 4096 #define HWCAP_S390_VXRS_EXT 8192 +#define HWCAP_S390_GS 16384 /* Internal bits, not exposed via elf */ #define HWCAP_INT_SIE 1UL diff --git a/arch/s390/include/asm/emergency-restart.h b/arch/s390/include/asm/emergency-restart.h deleted file mode 100644 index 108d8c48e42e..000000000000 --- a/arch/s390/include/asm/emergency-restart.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_EMERGENCY_RESTART_H -#define _ASM_EMERGENCY_RESTART_H - -#include <asm-generic/emergency-restart.h> - -#endif /* _ASM_EMERGENCY_RESTART_H */ diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h index 09b406db7529..cb60d5c5755d 100644 --- a/arch/s390/include/asm/facility.h +++ b/arch/s390/include/asm/facility.h @@ -8,14 +8,11 @@ #define __ASM_FACILITY_H #include <generated/facilities.h> - -#ifndef __ASSEMBLY__ - #include <linux/string.h> #include <linux/preempt.h> #include <asm/lowcore.h> -#define MAX_FACILITY_BIT (256*8) /* stfle_fac_list has 256 bytes */ +#define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8) static inline int __test_facility(unsigned long nr, void *facilities) { @@ -72,5 +69,4 @@ static inline void stfle(u64 *stfle_fac_list, int size) preempt_enable(); } -#endif /* __ASSEMBLY__ */ #endif /* __ASM_FACILITY_H */ diff --git a/arch/s390/include/asm/irq_regs.h b/arch/s390/include/asm/irq_regs.h deleted file mode 100644 index 3dd9c0b70270..000000000000 --- a/arch/s390/include/asm/irq_regs.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/irq_regs.h> diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h index 68d7d68300f2..8a0b721a9b8d 100644 --- a/arch/s390/include/asm/isc.h +++ b/arch/s390/include/asm/isc.h @@ -16,6 +16,7 @@ #define CONSOLE_ISC 1 /* console I/O subchannel */ #define EADM_SCH_ISC 4 /* EADM subchannels */ #define CHSC_SCH_ISC 7 /* CHSC subchannels */ +#define VFIO_CCW_ISC IO_SCH_ISC /* VFIO-CCW I/O subchannels */ /* Adapter interrupts. */ #define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ #define PCI_ISC 2 /* PCI I/O subchannels */ diff --git a/arch/s390/include/asm/kmap_types.h b/arch/s390/include/asm/kmap_types.h deleted file mode 100644 index 0a88622339ee..000000000000 --- a/arch/s390/include/asm/kmap_types.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -#include <asm-generic/kmap_types.h> - -#endif diff --git a/arch/s390/include/asm/local.h b/arch/s390/include/asm/local.h deleted file mode 100644 index c11c530f74d0..000000000000 --- a/arch/s390/include/asm/local.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/local.h> diff --git a/arch/s390/include/asm/local64.h b/arch/s390/include/asm/local64.h deleted file mode 100644 index 36c93b5cc239..000000000000 --- a/arch/s390/include/asm/local64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/local64.h> diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 61261e0e95c0..8a5b082797f8 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -157,8 +157,8 @@ struct lowcore { __u64 stfle_fac_list[32]; /* 0x0f00 */ __u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */ - /* Pointer to vector register save area */ - __u64 vector_save_area_addr; /* 0x11b0 */ + /* Pointer to the machine check extended save area */ + __u64 mcesad; /* 0x11b0 */ /* 64 bit extparam used for pfault/diag 250: defined by architecture */ __u64 ext_params2; /* 0x11B8 */ @@ -182,10 +182,7 @@ struct lowcore { /* Transaction abort diagnostic block */ __u8 pgm_tdb[256]; /* 0x1800 */ - __u8 pad_0x1900[0x1c00-0x1900]; /* 0x1900 */ - - /* Software defined save area for vector registers */ - __u8 vector_save_area[1024]; /* 0x1c00 */ + __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ } __packed; #define S390_lowcore (*((struct lowcore *) 0)) diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h index b55a59e1d134..b79813d9cf68 100644 --- a/arch/s390/include/asm/mman.h +++ b/arch/s390/include/asm/mman.h @@ -8,8 +8,4 @@ #include <uapi/asm/mman.h> -#ifndef __ASSEMBLY__ -int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags); -#define arch_mmap_check(addr, len, flags) s390_mmap_check(addr, len, flags) -#endif #endif /* __S390_MMAN_H__ */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bea785d7f853..bd6f30304518 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -22,6 +22,8 @@ typedef struct { unsigned int has_pgste:1; /* The mmu context uses storage keys. */ unsigned int use_skey:1; + /* The mmu context uses CMMA. */ + unsigned int use_cmma:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index fa2bf69be182..8712e11bead4 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -28,6 +28,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.alloc_pgste = page_table_allocate_pgste; mm->context.has_pgste = 0; mm->context.use_skey = 0; + mm->context.use_cmma = 0; #endif switch (mm->context.asce_limit) { case 1UL << 42: diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index b75fd910386a..e3e8895f5d3e 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -58,7 +58,9 @@ union mci { u64 ie : 1; /* 32 indirect storage error */ u64 ar : 1; /* 33 access register validity */ u64 da : 1; /* 34 delayed access exception */ - u64 : 7; /* 35-41 */ + u64 : 1; /* 35 */ + u64 gs : 1; /* 36 guarded storage registers */ + u64 : 5; /* 37-41 */ u64 pr : 1; /* 42 tod programmable register validity */ u64 fc : 1; /* 43 fp control register validity */ u64 ap : 1; /* 44 ancillary report */ @@ -69,6 +71,14 @@ union mci { }; }; +#define MCESA_ORIGIN_MASK (~0x3ffUL) +#define MCESA_LC_MASK (0xfUL) + +struct mcesa { + u8 vector_save_area[1024]; + u8 guarded_storage_save_area[32]; +}; + struct pt_regs; extern void s390_handle_mcck(void); diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h new file mode 100644 index 000000000000..42267a2fe29e --- /dev/null +++ b/arch/s390/include/asm/page-states.h @@ -0,0 +1,19 @@ +/* + * Copyright IBM Corp. 2017 + * Author(s): Claudio Imbrenda <imbrenda@linux.vnet.ibm.com> + */ + +#ifndef PAGE_STATES_H +#define PAGE_STATES_H + +#define ESSA_GET_STATE 0 +#define ESSA_SET_STABLE 1 +#define ESSA_SET_UNUSED 2 +#define ESSA_SET_VOLATILE 3 +#define ESSA_SET_POT_VOLATILE 4 +#define ESSA_SET_STABLE_RESIDENT 5 +#define ESSA_SET_STABLE_IF_RESIDENT 6 + +#define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT + +#endif diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index c64c0befd3f3..dd32beb9d30c 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -1,7 +1,7 @@ /* * Performance event support - s390 specific definitions. * - * Copyright IBM Corp. 2009, 2013 + * Copyright IBM Corp. 2009, 2017 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ @@ -47,7 +47,7 @@ struct perf_sf_sde_regs { }; /* Perf PMU definitions for the counter facility */ -#define PERF_CPUM_CF_MAX_CTR 256 +#define PERF_CPUM_CF_MAX_CTR 0xffffUL /* Max ctr for ECCTR */ /* Perf PMU definitions for the sampling facility */ #define PERF_CPUM_SF_MAX_CTR 2 diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ecec682bb516..e6e3b887bee3 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -372,10 +372,12 @@ static inline int is_module_addr(void *addr) #define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ -#define _PGSTE_GPS_ZERO 0x0000000080000000UL -#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL -#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL -#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL +#define _PGSTE_GPS_ZERO 0x0000000080000000UL +#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL +#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL +#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL +#define _PGSTE_GPS_USAGE_POT_VOLATILE 0x0000000002000000UL +#define _PGSTE_GPS_USAGE_VOLATILE _PGSTE_GPS_USAGE_MASK /* * A user page table pointer has the space-switch-event bit, the @@ -1041,6 +1043,12 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key); +int set_pgste_bits(struct mm_struct *mm, unsigned long addr, + unsigned long bits, unsigned long value); +int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); +int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste); + /* * Certain architectures need to do special things when PTEs * within a page table are directly modified. Thus, the following diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h index b48aef4188f6..4c484590d858 100644 --- a/arch/s390/include/asm/pkey.h +++ b/arch/s390/include/asm/pkey.h @@ -87,4 +87,25 @@ int pkey_findcard(const struct pkey_seckey *seckey, int pkey_skey2pkey(const struct pkey_seckey *seckey, struct pkey_protkey *protkey); +/* + * Verify the given secure key for being able to be useable with + * the pkey module. Check for correct key type and check for having at + * least one crypto card being able to handle this key (master key + * or old master key verification pattern matches). + * Return some info about the key: keysize in bits, keytype (currently + * only AES), flag if key is wrapped with an old MKVP. + * @param seckey pointer to buffer with the input secure key + * @param pcardnr pointer to cardnr, receives the card number on success + * @param pdomain pointer to domain, receives the domain number on success + * @param pkeysize pointer to keysize, receives the bitsize of the key + * @param pattributes pointer to attributes, receives additional info + * PKEY_VERIFY_ATTR_AES if the key is an AES key + * PKEY_VERIFY_ATTR_OLD_MKVP if key has old mkvp stored in + * @return 0 on success, negative errno value on failure. If no card could + * be found which is able to handle this key, -ENODEV is returned. + */ +int pkey_verifykey(const struct pkey_seckey *seckey, + u16 *pcardnr, u16 *pdomain, + u16 *pkeysize, u32 *pattributes); + #endif /* _KAPI_PKEY_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index e4988710aa86..60d395fdc864 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -91,14 +91,15 @@ extern void execve_tail(void); * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. */ -#define TASK_SIZE_OF(tsk) ((tsk)->mm ? \ - (tsk)->mm->context.asce_limit : TASK_MAX_SIZE) +#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \ + (1UL << 31) : (1UL << 53)) #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ (1UL << 30) : (1UL << 41)) #define TASK_SIZE TASK_SIZE_OF(current) -#define TASK_MAX_SIZE (1UL << 53) +#define TASK_SIZE_MAX (1UL << 53) -#define STACK_TOP (1UL << (test_thread_flag(TIF_31BIT) ? 31:42)) +#define STACK_TOP (test_thread_flag(TIF_31BIT) ? \ + (1UL << 31) : (1UL << 42)) #define STACK_TOP_MAX (1UL << 42) #define HAVE_ARCH_PICK_MMAP_LAYOUT @@ -135,6 +136,8 @@ struct thread_struct { struct list_head list; /* cpu runtime instrumentation */ struct runtime_instr_cb *ri_cb; + struct gs_cb *gs_cb; /* Current guarded storage cb */ + struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */ unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ /* * Warning: 'fpu' is dynamically-sized. It *MUST* be at @@ -215,6 +218,9 @@ void show_cacheinfo(struct seq_file *m); /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); +/* Free guarded storage control block for current */ +void exit_thread_gs(void); + /* * Return saved PC of a blocked thread. */ diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 30bdb5a027f3..cd78155b1829 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -29,8 +29,8 @@ #define MACHINE_FLAG_TE _BITUL(11) #define MACHINE_FLAG_TLB_LC _BITUL(12) #define MACHINE_FLAG_VX _BITUL(13) -#define MACHINE_FLAG_CAD _BITUL(14) -#define MACHINE_FLAG_NX _BITUL(15) +#define MACHINE_FLAG_NX _BITUL(14) +#define MACHINE_FLAG_GS _BITUL(15) #define LPP_MAGIC _BITUL(31) #define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) @@ -68,8 +68,8 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) #define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) -#define MACHINE_HAS_CAD (S390_lowcore.machine_flags & MACHINE_FLAG_CAD) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) +#define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) /* * Console mode. Override with conmode= diff --git a/arch/s390/include/asm/sparsemem.h b/arch/s390/include/asm/sparsemem.h index 487428b6d099..334e279f1bce 100644 --- a/arch/s390/include/asm/sparsemem.h +++ b/arch/s390/include/asm/sparsemem.h @@ -2,6 +2,6 @@ #define _ASM_S390_SPARSEMEM_H #define SECTION_SIZE_BITS 28 -#define MAX_PHYSMEM_BITS 46 +#define MAX_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS #endif /* _ASM_S390_SPARSEMEM_H */ diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index ffc45048ea7d..f7838ecd83c6 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -10,6 +10,7 @@ #define __ASM_SPINLOCK_H #include <linux/smp.h> +#include <asm/atomic_ops.h> #include <asm/barrier.h> #include <asm/processor.h> @@ -17,12 +18,6 @@ extern int spin_retry; -static inline int -_raw_compare_and_swap(unsigned int *lock, unsigned int old, unsigned int new) -{ - return __sync_bool_compare_and_swap(lock, old, new); -} - #ifndef CONFIG_SMP static inline bool arch_vcpu_is_preempted(int cpu) { return false; } #else @@ -40,7 +35,7 @@ bool arch_vcpu_is_preempted(int cpu); * (the type definitions are in asm/spinlock_types.h) */ -void arch_lock_relax(unsigned int cpu); +void arch_lock_relax(int cpu); void arch_spin_lock_wait(arch_spinlock_t *); int arch_spin_trylock_retry(arch_spinlock_t *); @@ -70,7 +65,7 @@ static inline int arch_spin_trylock_once(arch_spinlock_t *lp) { barrier(); return likely(arch_spin_value_unlocked(*lp) && - _raw_compare_and_swap(&lp->lock, 0, SPINLOCK_LOCKVAL)); + __atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL)); } static inline void arch_spin_lock(arch_spinlock_t *lp) @@ -95,7 +90,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lp) static inline void arch_spin_unlock(arch_spinlock_t *lp) { - typecheck(unsigned int, lp->lock); + typecheck(int, lp->lock); asm volatile( "st %1,%0\n" : "+Q" (lp->lock) @@ -141,16 +136,16 @@ extern int _raw_write_trylock_retry(arch_rwlock_t *lp); static inline int arch_read_trylock_once(arch_rwlock_t *rw) { - unsigned int old = ACCESS_ONCE(rw->lock); - return likely((int) old >= 0 && - _raw_compare_and_swap(&rw->lock, old, old + 1)); + int old = ACCESS_ONCE(rw->lock); + return likely(old >= 0 && + __atomic_cmpxchg_bool(&rw->lock, old, old + 1)); } static inline int arch_write_trylock_once(arch_rwlock_t *rw) { - unsigned int old = ACCESS_ONCE(rw->lock); + int old = ACCESS_ONCE(rw->lock); return likely(old == 0 && - _raw_compare_and_swap(&rw->lock, 0, 0x80000000)); + __atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)); } #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES @@ -161,9 +156,9 @@ static inline int arch_write_trylock_once(arch_rwlock_t *rw) #define __RAW_LOCK(ptr, op_val, op_string) \ ({ \ - unsigned int old_val; \ + int old_val; \ \ - typecheck(unsigned int *, ptr); \ + typecheck(int *, ptr); \ asm volatile( \ op_string " %0,%2,%1\n" \ "bcr 14,0\n" \ @@ -175,9 +170,9 @@ static inline int arch_write_trylock_once(arch_rwlock_t *rw) #define __RAW_UNLOCK(ptr, op_val, op_string) \ ({ \ - unsigned int old_val; \ + int old_val; \ \ - typecheck(unsigned int *, ptr); \ + typecheck(int *, ptr); \ asm volatile( \ op_string " %0,%2,%1\n" \ : "=d" (old_val), "+Q" (*ptr) \ @@ -187,14 +182,14 @@ static inline int arch_write_trylock_once(arch_rwlock_t *rw) }) extern void _raw_read_lock_wait(arch_rwlock_t *lp); -extern void _raw_write_lock_wait(arch_rwlock_t *lp, unsigned int prev); +extern void _raw_write_lock_wait(arch_rwlock_t *lp, int prev); static inline void arch_read_lock(arch_rwlock_t *rw) { - unsigned int old; + int old; old = __RAW_LOCK(&rw->lock, 1, __RAW_OP_ADD); - if ((int) old < 0) + if (old < 0) _raw_read_lock_wait(rw); } @@ -205,7 +200,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) static inline void arch_write_lock(arch_rwlock_t *rw) { - unsigned int old; + int old; old = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR); if (old != 0) @@ -232,11 +227,11 @@ static inline void arch_read_lock(arch_rwlock_t *rw) static inline void arch_read_unlock(arch_rwlock_t *rw) { - unsigned int old; + int old; do { old = ACCESS_ONCE(rw->lock); - } while (!_raw_compare_and_swap(&rw->lock, old, old - 1)); + } while (!__atomic_cmpxchg_bool(&rw->lock, old, old - 1)); } static inline void arch_write_lock(arch_rwlock_t *rw) @@ -248,7 +243,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw) static inline void arch_write_unlock(arch_rwlock_t *rw) { - typecheck(unsigned int, rw->lock); + typecheck(int, rw->lock); rw->owner = 0; asm volatile( diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index d84b6939237c..fe755eec275f 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -6,14 +6,14 @@ #endif typedef struct { - unsigned int lock; + int lock; } __attribute__ ((aligned (4))) arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { .lock = 0, } typedef struct { - unsigned int lock; - unsigned int owner; + int lock; + int owner; } arch_rwlock_t; #define __ARCH_RW_LOCK_UNLOCKED { 0 } diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index 12d45f0cfdd9..f6c2b5814ab0 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -10,6 +10,7 @@ #include <linux/thread_info.h> #include <asm/fpu/api.h> #include <asm/ptrace.h> +#include <asm/guarded_storage.h> extern struct task_struct *__switch_to(void *, void *); extern void update_cr_regs(struct task_struct *task); @@ -33,12 +34,14 @@ static inline void restore_access_regs(unsigned int *acrs) save_fpu_regs(); \ save_access_regs(&prev->thread.acrs[0]); \ save_ri_cb(prev->thread.ri_cb); \ + save_gs_cb(prev->thread.gs_cb); \ } \ if (next->mm) { \ update_cr_regs(next); \ set_cpu_flag(CIF_FPU); \ restore_access_regs(&next->thread.acrs[0]); \ restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ + restore_gs_cb(next->thread.gs_cb); \ } \ prev = __switch_to(prev,next); \ } while (0) diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 229326c942c7..73bff45ced55 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -142,7 +142,15 @@ struct sysinfo_3_2_2 { extern int topology_max_mnest; -#define TOPOLOGY_CORE_BITS 64 +/* + * Returns the maximum nesting level supported by the cpu topology code. + * The current maximum level is 4 which is the drawer level. + */ +static inline int topology_mnest_limit(void) +{ + return min(topology_max_mnest, 4); +} + #define TOPOLOGY_NR_MAG 6 struct topology_core { @@ -152,7 +160,7 @@ struct topology_core { unsigned char pp:2; unsigned char reserved1; unsigned short origin; - unsigned long mask[TOPOLOGY_CORE_BITS / BITS_PER_LONG]; + unsigned long mask; }; struct topology_container { diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index a5b54a445eb8..f36e6e2b73f0 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -54,11 +54,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_SYSCALL_TRACE 3 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ -#define TIF_SECCOMP 5 /* secure computing */ -#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_UPROBE 7 /* breakpointed or single-stepping */ +#define TIF_UPROBE 3 /* breakpointed or single-stepping */ +#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ +#define TIF_SYSCALL_TRACE 8 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ +#define TIF_SECCOMP 10 /* secure computing */ +#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */ #define TIF_31BIT 16 /* 32bit process */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ @@ -76,5 +77,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define _TIF_UPROBE _BITUL(TIF_UPROBE) #define _TIF_31BIT _BITUL(TIF_31BIT) #define _TIF_SINGLE_STEP _BITUL(TIF_SINGLE_STEP) +#define _TIF_GUARDED_STORAGE _BITUL(TIF_GUARDED_STORAGE) #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index 6848ba5c1454..addb09cee0f5 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -1,6 +1,16 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += errno.h +generic-y += fcntl.h +generic-y += ioctl.h +generic-y += mman.h +generic-y += param.h +generic-y += poll.h +generic-y += resource.h +generic-y += sockios.h +generic-y += termbits.h + header-y += auxvec.h header-y += bitsperlong.h header-y += byteorder.h @@ -11,25 +21,20 @@ header-y += cmb.h header-y += dasd.h header-y += debug.h header-y += errno.h -header-y += fcntl.h +header-y += guarded_storage.h header-y += hypfs.h -header-y += ioctl.h header-y += ioctls.h header-y += ipcbuf.h header-y += kvm.h header-y += kvm_para.h header-y += kvm_perf.h header-y += kvm_virtio.h -header-y += mman.h header-y += monwriter.h header-y += msgbuf.h -header-y += param.h header-y += pkey.h -header-y += poll.h header-y += posix_types.h header-y += ptrace.h header-y += qeth.h -header-y += resource.h header-y += schid.h header-y += sclp_ctl.h header-y += sembuf.h @@ -40,12 +45,10 @@ header-y += sigcontext.h header-y += siginfo.h header-y += signal.h header-y += socket.h -header-y += sockios.h header-y += stat.h header-y += statfs.h header-y += swab.h header-y += tape390.h -header-y += termbits.h header-y += termios.h header-y += types.h header-y += ucontext.h diff --git a/arch/s390/include/uapi/asm/errno.h b/arch/s390/include/uapi/asm/errno.h deleted file mode 100644 index 395e97d8005e..000000000000 --- a/arch/s390/include/uapi/asm/errno.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - */ - -#ifndef _S390_ERRNO_H -#define _S390_ERRNO_H - -#include <asm-generic/errno.h> - -#endif diff --git a/arch/s390/include/uapi/asm/fcntl.h b/arch/s390/include/uapi/asm/fcntl.h deleted file mode 100644 index 46ab12db5739..000000000000 --- a/arch/s390/include/uapi/asm/fcntl.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/fcntl.h> diff --git a/arch/s390/include/uapi/asm/guarded_storage.h b/arch/s390/include/uapi/asm/guarded_storage.h new file mode 100644 index 000000000000..852850e8e17e --- /dev/null +++ b/arch/s390/include/uapi/asm/guarded_storage.h @@ -0,0 +1,77 @@ +#ifndef _GUARDED_STORAGE_H +#define _GUARDED_STORAGE_H + +#include <linux/types.h> + +struct gs_cb { + __u64 reserved; + __u64 gsd; + __u64 gssm; + __u64 gs_epl_a; +}; + +struct gs_epl { + __u8 pad1; + union { + __u8 gs_eam; + struct { + __u8 : 6; + __u8 e : 1; + __u8 b : 1; + }; + }; + union { + __u8 gs_eci; + struct { + __u8 tx : 1; + __u8 cx : 1; + __u8 : 5; + __u8 in : 1; + }; + }; + union { + __u8 gs_eai; + struct { + __u8 : 1; + __u8 t : 1; + __u8 as : 2; + __u8 ar : 4; + }; + }; + __u32 pad2; + __u64 gs_eha; + __u64 gs_eia; + __u64 gs_eoa; + __u64 gs_eir; + __u64 gs_era; +}; + +#define GS_ENABLE 0 +#define GS_DISABLE 1 +#define GS_SET_BC_CB 2 +#define GS_CLEAR_BC_CB 3 +#define GS_BROADCAST 4 + +static inline void load_gs_cb(struct gs_cb *gs_cb) +{ + asm volatile(".insn rxy,0xe3000000004d,0,%0" : : "Q" (*gs_cb)); +} + +static inline void store_gs_cb(struct gs_cb *gs_cb) +{ + asm volatile(".insn rxy,0xe30000000049,0,%0" : : "Q" (*gs_cb)); +} + +static inline void save_gs_cb(struct gs_cb *gs_cb) +{ + if (gs_cb) + store_gs_cb(gs_cb); +} + +static inline void restore_gs_cb(struct gs_cb *gs_cb) +{ + if (gs_cb) + load_gs_cb(gs_cb); +} + +#endif /* _GUARDED_STORAGE_H */ diff --git a/arch/s390/include/uapi/asm/ioctl.h b/arch/s390/include/uapi/asm/ioctl.h deleted file mode 100644 index b279fe06dfe5..000000000000 --- a/arch/s390/include/uapi/asm/ioctl.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ioctl.h> diff --git a/arch/s390/include/uapi/asm/mman.h b/arch/s390/include/uapi/asm/mman.h deleted file mode 100644 index de23da1f41b2..000000000000 --- a/arch/s390/include/uapi/asm/mman.h +++ /dev/null @@ -1,6 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/mman.h" - */ -#include <asm-generic/mman.h> diff --git a/arch/s390/include/uapi/asm/param.h b/arch/s390/include/uapi/asm/param.h deleted file mode 100644 index c616821bf2ac..000000000000 --- a/arch/s390/include/uapi/asm/param.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASMS390_PARAM_H -#define _ASMS390_PARAM_H - -#include <asm-generic/param.h> - -#endif /* _ASMS390_PARAM_H */ diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h index ed7f19c27ce5..e6c04faf8a6c 100644 --- a/arch/s390/include/uapi/asm/pkey.h +++ b/arch/s390/include/uapi/asm/pkey.h @@ -109,4 +109,23 @@ struct pkey_skey2pkey { }; #define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey) +/* + * Verify the given secure key for being able to be useable with + * the pkey module. Check for correct key type and check for having at + * least one crypto card being able to handle this key (master key + * or old master key verification pattern matches). + * Return some info about the key: keysize in bits, keytype (currently + * only AES), flag if key is wrapped with an old MKVP. + */ +struct pkey_verifykey { + struct pkey_seckey seckey; /* in: the secure key blob */ + __u16 cardnr; /* out: card number */ + __u16 domain; /* out: domain number */ + __u16 keysize; /* out: key size in bits */ + __u32 attributes; /* out: attribute bits */ +}; +#define PKEY_VERIFYKEY _IOWR(PKEY_IOCTL_MAGIC, 0x07, struct pkey_verifykey) +#define PKEY_VERIFY_ATTR_AES 0x00000001 /* key is an AES key */ +#define PKEY_VERIFY_ATTR_OLD_MKVP 0x00000100 /* key has old MKVP value */ + #endif /* _UAPI_PKEY_H */ diff --git a/arch/s390/include/uapi/asm/poll.h b/arch/s390/include/uapi/asm/poll.h deleted file mode 100644 index c98509d3149e..000000000000 --- a/arch/s390/include/uapi/asm/poll.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/poll.h> diff --git a/arch/s390/include/uapi/asm/resource.h b/arch/s390/include/uapi/asm/resource.h deleted file mode 100644 index ec23d1c73c92..000000000000 --- a/arch/s390/include/uapi/asm/resource.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/resources.h" - */ - -#ifndef _S390_RESOURCE_H -#define _S390_RESOURCE_H - -#include <asm-generic/resource.h> - -#endif - diff --git a/arch/s390/include/uapi/asm/sockios.h b/arch/s390/include/uapi/asm/sockios.h deleted file mode 100644 index 6f60eee73242..000000000000 --- a/arch/s390/include/uapi/asm/sockios.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_S390_SOCKIOS_H -#define _ASM_S390_SOCKIOS_H - -#include <asm-generic/sockios.h> - -#endif diff --git a/arch/s390/include/uapi/asm/termbits.h b/arch/s390/include/uapi/asm/termbits.h deleted file mode 100644 index 71bf6ac6a2b9..000000000000 --- a/arch/s390/include/uapi/asm/termbits.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_S390_TERMBITS_H -#define _ASM_S390_TERMBITS_H - -#include <asm-generic/termbits.h> - -#endif diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 152de9b796e1..ea42290e7d51 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -313,7 +313,7 @@ #define __NR_copy_file_range 375 #define __NR_preadv2 376 #define __NR_pwritev2 377 -/* Number 378 is reserved for guarded storage */ +#define __NR_s390_guarded_storage 378 #define __NR_statx 379 #define NR_syscalls 380 diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 060ce548fe8b..adb3fe2e3d42 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -51,14 +51,12 @@ CFLAGS_dumpstack.o += -fno-optimize-sibling-calls # CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' -CFLAGS_sysinfo.o += -w - obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o als.o obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o -obj-y += runtime_instr.o cache.o fpu.o dumpstack.o -obj-y += entry.o reipl.o relocate_kernel.o +obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o +obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o extra-y += head.o head64.o vmlinux.lds diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index c4b3570ded5b..6bb29633e1f1 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -175,7 +175,7 @@ int main(void) /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); /* hardware defined lowcore locations 0x1000 - 0x18ff */ - OFFSET(__LC_VX_SAVE_AREA_ADDR, lowcore, vector_save_area_addr); + OFFSET(__LC_MCESAD, lowcore, mcesad); OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2); OFFSET(__LC_FPREGS_SAVE_AREA, lowcore, floating_pt_save_area); OFFSET(__LC_GPREGS_SAVE_AREA, lowcore, gpregs_save_area); diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index e89cc2e71db1..986642a3543b 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -178,4 +178,5 @@ COMPAT_SYSCALL_WRAP3(getpeername, int, fd, struct sockaddr __user *, usockaddr, COMPAT_SYSCALL_WRAP6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len); COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags); COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags); +COMPAT_SYSCALL_WRAP2(s390_guarded_storage, int, command, struct gs_cb *, gs_cb); COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index dd1d5c62c374..d628afc26708 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -429,6 +429,20 @@ static void *nt_vmcoreinfo(void *ptr) } /* + * Initialize final note (needed for /proc/vmcore code) + */ +static void *nt_final(void *ptr) +{ + Elf64_Nhdr *note; + + note = (Elf64_Nhdr *) ptr; + note->n_namesz = 0; + note->n_descsz = 0; + note->n_type = 0; + return PTR_ADD(ptr, sizeof(Elf64_Nhdr)); +} + +/* * Initialize ELF header (new kernel) */ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) @@ -515,6 +529,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) if (sa->prefix != 0) ptr = fill_cpu_elf_notes(ptr, cpu++, sa); ptr = nt_vmcoreinfo(ptr); + ptr = nt_final(ptr); memset(phdr, 0, sizeof(*phdr)); phdr->p_type = PT_NOTE; phdr->p_offset = notes_offset; diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 4e65c79cc5f2..5d20182ee8ae 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -231,9 +231,29 @@ static noinline __init void detect_machine_type(void) S390_lowcore.machine_flags |= MACHINE_FLAG_VM; } +/* Remove leading, trailing and double whitespace. */ +static inline void strim_all(char *str) +{ + char *s; + + s = strim(str); + if (s != str) + memmove(str, s, strlen(s)); + while (*str) { + if (!isspace(*str++)) + continue; + if (isspace(*str)) { + s = skip_spaces(str); + memmove(str, s, strlen(s) + 1); + } + } +} + static noinline __init void setup_arch_string(void) { struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page; + struct sysinfo_3_2_2 *vm = (struct sysinfo_3_2_2 *)&sysinfo_page; + char mstr[80], hvstr[17]; if (stsi(mach, 1, 1, 1)) return; @@ -241,14 +261,21 @@ static noinline __init void setup_arch_string(void) EBCASC(mach->type, sizeof(mach->type)); EBCASC(mach->model, sizeof(mach->model)); EBCASC(mach->model_capacity, sizeof(mach->model_capacity)); - dump_stack_set_arch_desc("%-16.16s %-4.4s %-16.16s %-16.16s (%s)", - mach->manufacturer, - mach->type, - mach->model, - mach->model_capacity, - MACHINE_IS_LPAR ? "LPAR" : - MACHINE_IS_VM ? "z/VM" : - MACHINE_IS_KVM ? "KVM" : "unknown"); + sprintf(mstr, "%-16.16s %-4.4s %-16.16s %-16.16s", + mach->manufacturer, mach->type, + mach->model, mach->model_capacity); + strim_all(mstr); + if (stsi(vm, 3, 2, 2) == 0 && vm->count) { + EBCASC(vm->vm[0].cpi, sizeof(vm->vm[0].cpi)); + sprintf(hvstr, "%-16.16s", vm->vm[0].cpi); + strim_all(hvstr); + } else { + sprintf(hvstr, "%s", + MACHINE_IS_LPAR ? "LPAR" : + MACHINE_IS_VM ? "z/VM" : + MACHINE_IS_KVM ? "KVM" : "unknown"); + } + dump_stack_set_arch_desc("%s (%s)", mstr, hvstr); } static __init void setup_topology(void) @@ -358,6 +385,8 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_NX; __ctl_set_bit(0, 20); } + if (test_facility(133)) + S390_lowcore.machine_flags |= MACHINE_FLAG_GS; } static inline void save_vector_registers(void) @@ -375,7 +404,7 @@ static int __init topology_setup(char *str) rc = kstrtobool(str, &enabled); if (!rc && !enabled) - S390_lowcore.machine_flags &= ~MACHINE_HAS_TOPOLOGY; + S390_lowcore.machine_flags &= ~MACHINE_FLAG_TOPOLOGY; return rc; } early_param("topology", topology_setup); @@ -405,23 +434,16 @@ early_param("noexec", noexec_setup); static int __init cad_setup(char *str) { - int val; - - get_option(&str, &val); - if (val && test_facility(128)) - S390_lowcore.machine_flags |= MACHINE_FLAG_CAD; - return 0; -} -early_param("cad", cad_setup); + bool enabled; + int rc; -static int __init cad_init(void) -{ - if (MACHINE_HAS_CAD) + rc = kstrtobool(str, &enabled); + if (!rc && enabled && test_facility(128)) /* Enable problem state CAD. */ __ctl_set_bit(2, 3); - return 0; + return rc; } -early_initcall(cad_init); +early_param("cad", cad_setup); static __init void memmove_early(void *dst, const void *src, size_t n) { diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 6a7d737d514c..c6cf338c9327 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -47,7 +47,7 @@ STACK_SIZE = 1 << STACK_SHIFT STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE _TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ - _TIF_UPROBE) + _TIF_UPROBE | _TIF_GUARDED_STORAGE) _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ _TIF_SYSCALL_TRACEPOINT) _CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ @@ -189,8 +189,6 @@ ENTRY(__switch_to) stg %r3,__LC_CURRENT # store task struct of next stg %r15,__LC_KERNEL_STACK # store end of kernel stack lg %r15,__THREAD_ksp(%r1) # load kernel stack of next - /* c4 is used in guest detection: arch/s390/kernel/perf_cpum_sf.c */ - lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 mvc __LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPP @@ -332,6 +330,8 @@ ENTRY(system_call) TSTMSK __TI_flags(%r12),_TIF_UPROBE jo .Lsysc_uprobe_notify #endif + TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE + jo .Lsysc_guarded_storage TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP jo .Lsysc_singlestep TSTMSK __TI_flags(%r12),_TIF_SIGPENDING @@ -409,6 +409,14 @@ ENTRY(system_call) #endif # +# _TIF_GUARDED_STORAGE is set, call guarded_storage_load +# +.Lsysc_guarded_storage: + lgr %r2,%r11 # pass pointer to pt_regs + larl %r14,.Lsysc_return + jg gs_load_bc_cb + +# # _PIF_PER_TRAP is set, call do_per_trap # .Lsysc_singlestep: @@ -663,6 +671,8 @@ ENTRY(io_int_handler) jo .Lio_sigpending TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME jo .Lio_notify_resume + TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE + jo .Lio_guarded_storage TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lio_vxrs TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) @@ -697,6 +707,18 @@ ENTRY(io_int_handler) jg load_fpu_regs # +# _TIF_GUARDED_STORAGE is set, call guarded_storage_load +# +.Lio_guarded_storage: + # TRACE_IRQS_ON already done at .Lio_return + ssm __LC_SVC_NEW_PSW # reenable interrupts + lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,gs_load_bc_cb + ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts + TRACE_IRQS_OFF + j .Lio_return + +# # _TIF_NEED_RESCHED is set, call schedule # .Lio_reschedule: diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 33f901865326..dbf5f7e18246 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -74,12 +74,14 @@ long sys_sigreturn(void); long sys_s390_personality(unsigned int personality); long sys_s390_runtime_instr(int command, int signum); +long sys_s390_guarded_storage(int command, struct gs_cb __user *); long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); DECLARE_PER_CPU(u64, mt_cycles[8]); void verify_facilities(void); +void gs_load_bc_cb(struct pt_regs *regs); void set_fs_fixup(void); #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c new file mode 100644 index 000000000000..6f064745c3b1 --- /dev/null +++ b/arch/s390/kernel/guarded_storage.c @@ -0,0 +1,128 @@ +/* + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/signal.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <asm/guarded_storage.h> +#include "entry.h" + +void exit_thread_gs(void) +{ + kfree(current->thread.gs_cb); + kfree(current->thread.gs_bc_cb); + current->thread.gs_cb = current->thread.gs_bc_cb = NULL; +} + +static int gs_enable(void) +{ + struct gs_cb *gs_cb; + + if (!current->thread.gs_cb) { + gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + if (!gs_cb) + return -ENOMEM; + gs_cb->gsd = 25; + preempt_disable(); + __ctl_set_bit(2, 4); + load_gs_cb(gs_cb); + current->thread.gs_cb = gs_cb; + preempt_enable(); + } + return 0; +} + +static int gs_disable(void) +{ + if (current->thread.gs_cb) { + preempt_disable(); + kfree(current->thread.gs_cb); + current->thread.gs_cb = NULL; + __ctl_clear_bit(2, 4); + preempt_enable(); + } + return 0; +} + +static int gs_set_bc_cb(struct gs_cb __user *u_gs_cb) +{ + struct gs_cb *gs_cb; + + gs_cb = current->thread.gs_bc_cb; + if (!gs_cb) { + gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL); + if (!gs_cb) + return -ENOMEM; + current->thread.gs_bc_cb = gs_cb; + } + if (copy_from_user(gs_cb, u_gs_cb, sizeof(*gs_cb))) + return -EFAULT; + return 0; +} + +static int gs_clear_bc_cb(void) +{ + struct gs_cb *gs_cb; + + gs_cb = current->thread.gs_bc_cb; + current->thread.gs_bc_cb = NULL; + kfree(gs_cb); + return 0; +} + +void gs_load_bc_cb(struct pt_regs *regs) +{ + struct gs_cb *gs_cb; + + preempt_disable(); + clear_thread_flag(TIF_GUARDED_STORAGE); + gs_cb = current->thread.gs_bc_cb; + if (gs_cb) { + kfree(current->thread.gs_cb); + current->thread.gs_bc_cb = NULL; + __ctl_set_bit(2, 4); + load_gs_cb(gs_cb); + current->thread.gs_cb = gs_cb; + } + preempt_enable(); +} + +static int gs_broadcast(void) +{ + struct task_struct *sibling; + + read_lock(&tasklist_lock); + for_each_thread(current, sibling) { + if (!sibling->thread.gs_bc_cb) + continue; + if (test_and_set_tsk_thread_flag(sibling, TIF_GUARDED_STORAGE)) + kick_process(sibling); + } + read_unlock(&tasklist_lock); + return 0; +} + +SYSCALL_DEFINE2(s390_guarded_storage, int, command, + struct gs_cb __user *, gs_cb) +{ + if (!MACHINE_HAS_GS) + return -EOPNOTSUPP; + switch (command) { + case GS_ENABLE: + return gs_enable(); + case GS_DISABLE: + return gs_disable(); + case GS_SET_BC_CB: + return gs_set_bc_cb(gs_cb); + case GS_CLEAR_BC_CB: + return gs_clear_bc_cb(); + case GS_BROADCAST: + return gs_broadcast(); + default: + return -EINVAL; + } +} diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index 0b5ebf8a3d30..eff5b31671d4 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -25,7 +25,6 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> -#include <asm/facility.h> #include <asm/page.h> #include <asm/ptrace.h> diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 482d3526e32b..31c91f24e562 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -52,7 +52,7 @@ ENTRY(startup_continue) .quad 0 # cr1: primary space segment table .quad .Lduct # cr2: dispatchable unit control table .quad 0 # cr3: instruction authorization - .quad 0 # cr4: instruction authorization + .quad 0xffff # cr4: instruction authorization .quad .Lduct # cr5: primary-aste origin .quad 0 # cr6: I/O interrupts .quad 0 # cr7: secondary space segment table diff --git a/arch/s390/kernel/kdebugfs.c b/arch/s390/kernel/kdebugfs.c new file mode 100644 index 000000000000..ee85e17dd79d --- /dev/null +++ b/arch/s390/kernel/kdebugfs.c @@ -0,0 +1,15 @@ +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/init.h> + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("s390", NULL); + if (IS_ERR(arch_debugfs_dir)) + arch_debugfs_dir = NULL; + return 0; +} +postcore_initcall(arch_kdebugfs_init); diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 3074c1d83829..db5658daf994 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -27,6 +27,7 @@ #include <asm/cacheflush.h> #include <asm/os_info.h> #include <asm/switch_to.h> +#include <asm/nmi.h> typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); @@ -102,6 +103,8 @@ static void __do_machine_kdump(void *image) */ static noinline void __machine_kdump(void *image) { + struct mcesa *mcesa; + unsigned long cr2_old, cr2_new; int this_cpu, cpu; lgr_info_log(); @@ -114,8 +117,16 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ + mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); if (MACHINE_HAS_VX) - save_vx_regs((void *) &S390_lowcore.vector_save_area); + save_vx_regs((__vector128 *) mcesa->vector_save_area); + if (MACHINE_HAS_GS) { + __ctl_store(cr2_old, 2, 2); + cr2_new = cr2_old | (1UL << 4); + __ctl_load(cr2_new, 2, 2); + save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); + __ctl_load(cr2_old, 2, 2); + } /* * To create a good backchain for this CPU in the dump store_status * is passed the address of a function. The address is saved into diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 9bf8327154ee..985589523970 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -106,6 +106,7 @@ static int notrace s390_validate_registers(union mci mci, int umode) int kill_task; u64 zero; void *fpt_save_area; + struct mcesa *mcesa; kill_task = 0; zero = 0; @@ -165,6 +166,7 @@ static int notrace s390_validate_registers(union mci mci, int umode) : : "Q" (S390_lowcore.fpt_creg_save_area)); } + mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); if (!MACHINE_HAS_VX) { /* Validate floating point registers */ asm volatile( @@ -209,8 +211,8 @@ static int notrace s390_validate_registers(union mci mci, int umode) " la 1,%0\n" " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ - : : "Q" (*(struct vx_array *) - &S390_lowcore.vector_save_area) : "1"); + : : "Q" (*(struct vx_array *) mcesa->vector_save_area) + : "1"); __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); } /* Validate access registers */ @@ -224,6 +226,19 @@ static int notrace s390_validate_registers(union mci mci, int umode) */ kill_task = 1; } + /* Validate guarded storage registers */ + if (MACHINE_HAS_GS && (S390_lowcore.cregs_save_area[2] & (1UL << 4))) { + if (!mci.gs) + /* + * Guarded storage register can't be restored and + * the current processes uses guarded storage. + * It has to be terminated. + */ + kill_task = 1; + else + load_gs_cb((struct gs_cb *) + mcesa->guarded_storage_save_area); + } /* * We don't even try to validate the TOD register, since we simply * can't write something sensible into that register. diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 1aba10e90906..746d03423333 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -1,7 +1,7 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012 + * Copyright IBM Corp. 2012, 2017 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> * * This program is free software; you can redistribute it and/or modify @@ -22,19 +22,12 @@ #include <asm/irq.h> #include <asm/cpu_mf.h> -/* CPU-measurement counter facility supports these CPU counter sets: - * For CPU counter sets: - * Basic counter set: 0-31 - * Problem-state counter set: 32-63 - * Crypto-activity counter set: 64-127 - * Extented counter set: 128-159 - */ enum cpumf_ctr_set { - /* CPU counter sets */ - CPUMF_CTR_SET_BASIC = 0, - CPUMF_CTR_SET_USER = 1, - CPUMF_CTR_SET_CRYPTO = 2, - CPUMF_CTR_SET_EXT = 3, + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ /* Maximum number of counter sets */ CPUMF_CTR_SET_MAX, @@ -47,6 +40,7 @@ static const u64 cpumf_state_ctl[CPUMF_CTR_SET_MAX] = { [CPUMF_CTR_SET_USER] = 0x04, [CPUMF_CTR_SET_CRYPTO] = 0x08, [CPUMF_CTR_SET_EXT] = 0x01, + [CPUMF_CTR_SET_MT_DIAG] = 0x20, }; static void ctr_set_enable(u64 *state, int ctr_set) @@ -76,19 +70,20 @@ struct cpu_hw_events { }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .ctr_set = { - [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0), }, .state = 0, .flags = 0, .txn_flags = 0, }; -static int get_counter_set(u64 event) +static enum cpumf_ctr_set get_counter_set(u64 event) { - int set = -1; + int set = CPUMF_CTR_SET_MAX; if (event < 32) set = CPUMF_CTR_SET_BASIC; @@ -98,34 +93,17 @@ static int get_counter_set(u64 event) set = CPUMF_CTR_SET_CRYPTO; else if (event < 256) set = CPUMF_CTR_SET_EXT; + else if (event >= 448 && event < 496) + set = CPUMF_CTR_SET_MT_DIAG; return set; } -static int validate_event(const struct hw_perf_event *hwc) -{ - switch (hwc->config_base) { - case CPUMF_CTR_SET_BASIC: - case CPUMF_CTR_SET_USER: - case CPUMF_CTR_SET_CRYPTO: - case CPUMF_CTR_SET_EXT: - /* check for reserved counters */ - if ((hwc->config >= 6 && hwc->config <= 31) || - (hwc->config >= 38 && hwc->config <= 63) || - (hwc->config >= 80 && hwc->config <= 127)) - return -EOPNOTSUPP; - break; - default: - return -EINVAL; - } - - return 0; -} - static int validate_ctr_version(const struct hw_perf_event *hwc) { struct cpu_hw_events *cpuhw; int err = 0; + u16 mtdiag_ctl; cpuhw = &get_cpu_var(cpu_hw_events); @@ -145,6 +123,27 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) (cpuhw->info.csvn > 2 && hwc->config > 255)) err = -EOPNOTSUPP; break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpuhw->info.csvn <= 3) + err = -EOPNOTSUPP; + /* + * MT-diagnostic counters are read-only. The counter set + * is automatically enabled and activated on all CPUs with + * multithreading (SMT). Deactivation of multithreading + * also disables the counter set. State changes are ignored + * by lcctl(). Because Linux controls SMT enablement through + * a kernel parameter only, the counter set is either disabled + * or enabled and active. + * + * Thus, the counters can only be used if SMT is on and the + * counter set is enabled and active. + */ + mtdiag_ctl = cpumf_state_ctl[CPUMF_CTR_SET_MT_DIAG]; + if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && + (cpuhw->info.enable_ctl & mtdiag_ctl) && + (cpuhw->info.act_ctl & mtdiag_ctl))) + err = -EOPNOTSUPP; + break; } put_cpu_var(cpu_hw_events); @@ -250,6 +249,11 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* loss of counter data alert */ if (alert & CPU_MF_INT_CF_LCDA) pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); } #define PMC_INIT 0 @@ -330,6 +334,7 @@ static int __hw_perf_event_init(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; + enum cpumf_ctr_set set; int err; u64 ev; @@ -370,25 +375,30 @@ static int __hw_perf_event_init(struct perf_event *event) if (ev == -1) return -ENOENT; - if (ev >= PERF_CPUM_CF_MAX_CTR) + if (ev > PERF_CPUM_CF_MAX_CTR) return -EINVAL; - /* Use the hardware perf event structure to store the counter number - * in 'config' member and the counter set to which the counter belongs - * in the 'config_base'. The counter set (config_base) is then used - * to enable/disable the counters. - */ - hwc->config = ev; - hwc->config_base = get_counter_set(ev); - - /* Validate the counter that is assigned to this event. - * Because the counter facility can use numerous counters at the - * same time without constraints, it is not necessary to explicitly - * validate event groups (event->group_leader != event). - */ - err = validate_event(hwc); - if (err) - return err; + /* Obtain the counter set to which the specified counter belongs */ + set = get_counter_set(ev); + switch (set) { + case CPUMF_CTR_SET_BASIC: + case CPUMF_CTR_SET_USER: + case CPUMF_CTR_SET_CRYPTO: + case CPUMF_CTR_SET_EXT: + case CPUMF_CTR_SET_MT_DIAG: + /* + * Use the hardware perf event structure to store the + * counter number in the 'config' member and the counter + * set number in the 'config_base'. The counter set number + * is then later used to enable/disable the counter(s). + */ + hwc->config = ev; + hwc->config_base = set; + break; + case CPUMF_CTR_SET_MAX: + /* The counter could not be associated to a counter set */ + return -EINVAL; + }; /* Initialize for using the CPU-measurement counter facility */ if (!atomic_inc_not_zero(&num_events)) { @@ -452,7 +462,7 @@ static int hw_perf_event_reset(struct perf_event *event) return err; } -static int hw_perf_event_update(struct perf_event *event) +static void hw_perf_event_update(struct perf_event *event) { u64 prev, new, delta; int err; @@ -461,14 +471,12 @@ static int hw_perf_event_update(struct perf_event *event) prev = local64_read(&event->hw.prev_count); err = ecctr(event->hw.config, &new); if (err) - goto out; + return; } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); delta = (prev <= new) ? new - prev : (-1ULL - prev) + new + 1; /* overflow */ local64_add(delta, &event->count); -out: - return err; } static void cpumf_pmu_read(struct perf_event *event) diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index c343ac2cf6c5..d3133285b7d1 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -114,8 +114,64 @@ CPUMF_EVENT_ATTR(cf_zec12, L1I_OFFBOOK_L3_SOURCED_WRITES_IV, 0x00a1); CPUMF_EVENT_ATTR(cf_zec12, TX_NC_TABORT, 0x00b1); CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_NO_SPECIAL, 0x00b2); CPUMF_EVENT_ATTR(cf_zec12, TX_C_TABORT_SPECIAL, 0x00b3); +CPUMF_EVENT_ATTR(cf_z13, L1D_WRITES_RO_EXCL, 0x0080); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_HPAGE_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z13, DTLB1_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z13, L1D_L2D_SOURCED_WRITES, 0x0085); +CPUMF_EVENT_ATTR(cf_z13, ITLB1_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z13, ITLB1_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z13, L1I_L2I_SOURCED_WRITES, 0x0088); +CPUMF_EVENT_ATTR(cf_z13, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z13, TLB2_CRSTE_WRITES, 0x008b); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z13, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z13, L1C_TLB1_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0091); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES, 0x0092); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV, 0x0093); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES, 0x0094); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x0095); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES, 0x0097); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x0098); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x0099); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x009a); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x009b); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x009c); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x009d); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES, 0x009e); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES, 0x009f); +CPUMF_EVENT_ATTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES, 0x00a0); +CPUMF_EVENT_ATTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES, 0x00a1); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES, 0x00a4); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV, 0x00a5); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES, 0x00a6); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00a7); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV, 0x00a8); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES, 0x00a9); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES, 0x00aa); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV, 0x00ab); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES, 0x00ac); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES, 0x00ad); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES, 0x00af); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES, 0x00b0); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES, 0x00b1); +CPUMF_EVENT_ATTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES, 0x00b2); +CPUMF_EVENT_ATTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES, 0x00b3); +CPUMF_EVENT_ATTR(cf_z13, TX_NC_TABORT, 0x00da); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_NO_SPECIAL, 0x00db); +CPUMF_EVENT_ATTR(cf_z13, TX_C_TABORT_SPECIAL, 0x00dc); +CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); -static struct attribute *cpumcf_pmu_event_attr[] = { +static struct attribute *cpumcf_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf, CPU_CYCLES), CPUMF_EVENT_PTR(cf, INSTRUCTIONS), CPUMF_EVENT_PTR(cf, L1I_DIR_WRITES), @@ -236,28 +292,87 @@ static struct attribute *cpumcf_zec12_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z13_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z13, L1D_WRITES_RO_EXCL), + CPUMF_EVENT_PTR(cf_z13, DTLB1_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, DTLB1_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_L2D_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, ITLB1_WRITES), + CPUMF_EVENT_PTR(cf_z13, ITLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, L1I_L2I_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_HPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z13, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z13, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z13, L1C_TLB1_MISSES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_SCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_FCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONNODE_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_OFFDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1D_ONCHIP_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_SCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L4_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES_IV), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_FCOL_L3_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONNODE_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_OFFDRAWER_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, L1I_ONCHIP_MEM_SOURCED_WRITES), + CPUMF_EVENT_PTR(cf_z13, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z13, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z13, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ -static struct attribute_group cpumsf_pmu_events_group = { +static struct attribute_group cpumcf_pmu_events_group = { .name = "events", - .attrs = cpumcf_pmu_event_attr, }; PMU_FORMAT_ATTR(event, "config:0-63"); -static struct attribute *cpumsf_pmu_format_attr[] = { +static struct attribute *cpumcf_pmu_format_attr[] = { &format_attr_event.attr, NULL, }; -static struct attribute_group cpumsf_pmu_format_group = { +static struct attribute_group cpumcf_pmu_format_group = { .name = "format", - .attrs = cpumsf_pmu_format_attr, + .attrs = cpumcf_pmu_format_attr, }; -static const struct attribute_group *cpumsf_pmu_attr_groups[] = { - &cpumsf_pmu_events_group, - &cpumsf_pmu_format_group, +static const struct attribute_group *cpumcf_pmu_attr_groups[] = { + &cpumcf_pmu_events_group, + &cpumcf_pmu_format_group, NULL, }; @@ -290,6 +405,7 @@ static __init struct attribute **merge_attr(struct attribute **a, __init const struct attribute_group **cpumf_cf_event_group(void) { struct attribute **combined, **model; + struct attribute *none[] = { NULL }; struct cpuid cpu_id; get_cpu_id(&cpu_id); @@ -306,17 +422,17 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 0x2828: model = cpumcf_zec12_pmu_event_attr; break; + case 0x2964: + case 0x2965: + model = cpumcf_z13_pmu_event_attr; + break; default: - model = NULL; + model = none; break; } - if (!model) - goto out; - combined = merge_attr(cpumcf_pmu_event_attr, model); if (combined) - cpumsf_pmu_events_group.attrs = combined; -out: - return cpumsf_pmu_attr_groups; + cpumcf_pmu_events_group.attrs = combined; + return cpumcf_pmu_attr_groups; } diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 1c0b58545c04..9a4f279d25ca 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1009,8 +1009,8 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) * sample. Some early samples or samples from guests without * lpp usage would be misaccounted to the host. We use the asn * value as an addon heuristic to detect most of these guest samples. - * If the value differs from the host hpp value, we assume to be a - * KVM guest. + * If the value differs from 0xffff (the host value), we assume to + * be a KVM guest. */ switch (sfr->basic.CL) { case 1: /* logical partition */ @@ -1020,8 +1020,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) sde_regs->in_guest = 1; break; default: /* old machine, use heuristics */ - if (sfr->basic.gpp || - sfr->basic.prim_asn != (u16)sfr->basic.hpp) + if (sfr->basic.gpp || sfr->basic.prim_asn != 0xffff) sde_regs->in_guest = 1; break; } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index f29e41c5e2ec..999d7154bbdc 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -73,8 +73,10 @@ extern void kernel_thread_starter(void); */ void exit_thread(struct task_struct *tsk) { - if (tsk == current) + if (tsk == current) { exit_thread_runtime_instr(); + exit_thread_gs(); + } } void flush_thread(void) @@ -159,6 +161,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, /* Don't copy runtime instrumentation info */ p->thread.ri_cb = NULL; frame->childregs.psw.mask &= ~PSW_MASK_RI; + /* Don't copy guarded storage control block */ + p->thread.gs_cb = NULL; + p->thread.gs_bc_cb = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 928b929a6261..778cd6536175 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/cpufeature.h> +#include <linux/bitops.h> #include <linux/kernel.h> #include <linux/sched/mm.h> #include <linux/init.h> @@ -91,11 +92,23 @@ int cpu_have_feature(unsigned int num) } EXPORT_SYMBOL(cpu_have_feature); +static void show_facilities(struct seq_file *m) +{ + unsigned int bit; + long *facilities; + + facilities = (long *)&S390_lowcore.stfle_fac_list; + seq_puts(m, "facilities :"); + for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT) + seq_printf(m, " %d", bit); + seq_putc(m, '\n'); +} + static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe" + "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs" }; static const char * const int_hwcap_str[] = { "sie" @@ -116,6 +129,7 @@ static void show_cpu_summary(struct seq_file *m, void *v) if (int_hwcap_str[i] && (int_hwcap & (1UL << i))) seq_printf(m, "%s ", int_hwcap_str[i]); seq_puts(m, "\n"); + show_facilities(m); show_cacheinfo(m); for_each_online_cpu(cpu) { struct cpuid *id = &per_cpu(cpu_info.cpu_id, cpu); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index c14df0a1ec3c..488c5bb8dc77 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -44,30 +44,42 @@ void update_cr_regs(struct task_struct *task) struct pt_regs *regs = task_pt_regs(task); struct thread_struct *thread = &task->thread; struct per_regs old, new; - + unsigned long cr0_old, cr0_new; + unsigned long cr2_old, cr2_new; + int cr0_changed, cr2_changed; + + __ctl_store(cr0_old, 0, 0); + __ctl_store(cr2_old, 2, 2); + cr0_new = cr0_old; + cr2_new = cr2_old; /* Take care of the enable/disable of transactional execution. */ if (MACHINE_HAS_TE) { - unsigned long cr, cr_new; - - __ctl_store(cr, 0, 0); /* Set or clear transaction execution TXC bit 8. */ - cr_new = cr | (1UL << 55); + cr0_new |= (1UL << 55); if (task->thread.per_flags & PER_FLAG_NO_TE) - cr_new &= ~(1UL << 55); - if (cr_new != cr) - __ctl_load(cr_new, 0, 0); + cr0_new &= ~(1UL << 55); /* Set or clear transaction execution TDC bits 62 and 63. */ - __ctl_store(cr, 2, 2); - cr_new = cr & ~3UL; + cr2_new &= ~3UL; if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND) - cr_new |= 1UL; + cr2_new |= 1UL; else - cr_new |= 2UL; + cr2_new |= 2UL; } - if (cr_new != cr) - __ctl_load(cr_new, 2, 2); } + /* Take care of enable/disable of guarded storage. */ + if (MACHINE_HAS_GS) { + cr2_new &= ~(1UL << 4); + if (task->thread.gs_cb) + cr2_new |= (1UL << 4); + } + /* Load control register 0/2 iff changed */ + cr0_changed = cr0_new != cr0_old; + cr2_changed = cr2_new != cr2_old; + if (cr0_changed) + __ctl_load(cr0_new, 0, 0); + if (cr2_changed) + __ctl_load(cr2_new, 2, 2); /* Copy user specified PER registers */ new.control = thread->per_user.control; new.start = thread->per_user.start; @@ -1137,6 +1149,74 @@ static int s390_system_call_set(struct task_struct *target, data, 0, sizeof(unsigned int)); } +static int s390_gs_cb_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) + return -ENODATA; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + +static int s390_gs_cb_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + target->thread.gs_cb = data; + } + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + +static int s390_gs_bc_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_bc_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) + return -ENODATA; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + +static int s390_gs_bc_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct gs_cb *data = target->thread.gs_bc_cb; + + if (!MACHINE_HAS_GS) + return -ENODEV; + if (!data) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + target->thread.gs_bc_cb = data; + } + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); +} + static const struct user_regset s390_regsets[] = { { .core_note_type = NT_PRSTATUS, @@ -1194,6 +1274,22 @@ static const struct user_regset s390_regsets[] = { .get = s390_vxrs_high_get, .set = s390_vxrs_high_set, }, + { + .core_note_type = NT_S390_GS_CB, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_gs_cb_get, + .set = s390_gs_cb_set, + }, + { + .core_note_type = NT_S390_GS_BC, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_gs_bc_get, + .set = s390_gs_bc_set, + }, }; static const struct user_regset_view user_s390_view = { @@ -1422,6 +1518,14 @@ static const struct user_regset s390_compat_regsets[] = { .get = s390_compat_regs_high_get, .set = s390_compat_regs_high_set, }, + { + .core_note_type = NT_S390_GS_CB, + .n = sizeof(struct gs_cb) / sizeof(__u64), + .size = sizeof(__u64), + .align = sizeof(__u64), + .get = s390_gs_cb_get, + .set = s390_gs_cb_set, + }, }; static const struct user_regset_view user_s390_compat_view = { diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 911dc0b49be0..3ae756c0db3d 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -339,9 +339,15 @@ static void __init setup_lowcore(void) lc->stfl_fac_list = S390_lowcore.stfl_fac_list; memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, MAX_FACILITY_BIT/8); - if (MACHINE_HAS_VX) - lc->vector_save_area_addr = - (unsigned long) &lc->vector_save_area; + if (MACHINE_HAS_VX || MACHINE_HAS_GS) { + unsigned long bits, size; + + bits = MACHINE_HAS_GS ? 11 : 10; + size = 1UL << bits; + lc->mcesad = (__u64) memblock_virt_alloc(size, size); + if (MACHINE_HAS_GS) + lc->mcesad |= bits; + } lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0]; lc->sync_enter_timer = S390_lowcore.sync_enter_timer; lc->async_enter_timer = S390_lowcore.async_enter_timer; @@ -779,6 +785,12 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_S390_VXRS_BCD; } + /* + * Guarded storage support HWCAP_S390_GS is bit 12. + */ + if (MACHINE_HAS_GS) + elf_hwcap |= HWCAP_S390_GS; + get_cpu_id(&cpu_id); add_device_randomness(&cpu_id, sizeof(cpu_id)); switch (cpu_id.machine) { diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 5dab859b0d54..363000a77ffc 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -51,6 +51,7 @@ #include <asm/os_info.h> #include <asm/sigp.h> #include <asm/idle.h> +#include <asm/nmi.h> #include "entry.h" enum { @@ -78,6 +79,8 @@ struct pcpu { static u8 boot_core_type; static struct pcpu pcpu_devices[NR_CPUS]; +static struct kmem_cache *pcpu_mcesa_cache; + unsigned int smp_cpu_mt_shift; EXPORT_SYMBOL(smp_cpu_mt_shift); @@ -188,8 +191,10 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) { unsigned long async_stack, panic_stack; + unsigned long mcesa_origin, mcesa_bits; struct lowcore *lc; + mcesa_origin = mcesa_bits = 0; if (pcpu != &pcpu_devices[0]) { pcpu->lowcore = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); @@ -197,20 +202,27 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) panic_stack = __get_free_page(GFP_KERNEL); if (!pcpu->lowcore || !panic_stack || !async_stack) goto out; + if (MACHINE_HAS_VX || MACHINE_HAS_GS) { + mcesa_origin = (unsigned long) + kmem_cache_alloc(pcpu_mcesa_cache, GFP_KERNEL); + if (!mcesa_origin) + goto out; + mcesa_bits = MACHINE_HAS_GS ? 11 : 0; + } } else { async_stack = pcpu->lowcore->async_stack - ASYNC_FRAME_OFFSET; panic_stack = pcpu->lowcore->panic_stack - PANIC_FRAME_OFFSET; + mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK; + mcesa_bits = pcpu->lowcore->mcesad & MCESA_LC_MASK; } lc = pcpu->lowcore; memcpy(lc, &S390_lowcore, 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + ASYNC_FRAME_OFFSET; lc->panic_stack = panic_stack + PANIC_FRAME_OFFSET; + lc->mcesad = mcesa_origin | mcesa_bits; lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); - if (MACHINE_HAS_VX) - lc->vector_save_area_addr = - (unsigned long) &lc->vector_save_area; if (vdso_alloc_per_cpu(lc)) goto out; lowcore_ptr[cpu] = lc; @@ -218,6 +230,9 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) return 0; out: if (pcpu != &pcpu_devices[0]) { + if (mcesa_origin) + kmem_cache_free(pcpu_mcesa_cache, + (void *) mcesa_origin); free_page(panic_stack); free_pages(async_stack, ASYNC_ORDER); free_pages((unsigned long) pcpu->lowcore, LC_ORDER); @@ -229,11 +244,17 @@ out: static void pcpu_free_lowcore(struct pcpu *pcpu) { + unsigned long mcesa_origin; + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); lowcore_ptr[pcpu - pcpu_devices] = NULL; vdso_free_per_cpu(pcpu->lowcore); if (pcpu == &pcpu_devices[0]) return; + if (MACHINE_HAS_VX || MACHINE_HAS_GS) { + mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK; + kmem_cache_free(pcpu_mcesa_cache, (void *) mcesa_origin); + } free_page(pcpu->lowcore->panic_stack-PANIC_FRAME_OFFSET); free_pages(pcpu->lowcore->async_stack-ASYNC_FRAME_OFFSET, ASYNC_ORDER); free_pages((unsigned long) pcpu->lowcore, LC_ORDER); @@ -550,9 +571,11 @@ int smp_store_status(int cpu) if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; - if (!MACHINE_HAS_VX) + if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) return 0; - pa = __pa(pcpu->lowcore->vector_save_area_addr); + pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK); + if (MACHINE_HAS_GS) + pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK; if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; @@ -897,12 +920,22 @@ void __init smp_fill_possible_mask(void) void __init smp_prepare_cpus(unsigned int max_cpus) { + unsigned long size; + /* request the 0x1201 emergency signal external interrupt */ if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1201"); /* request the 0x1202 external call external interrupt */ if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); + /* create slab cache for the machine-check-extended-save-areas */ + if (MACHINE_HAS_VX || MACHINE_HAS_GS) { + size = 1UL << (MACHINE_HAS_GS ? 11 : 10); + pcpu_mcesa_cache = kmem_cache_create("nmi_save_areas", + size, size, 0, NULL); + if (!pcpu_mcesa_cache) + panic("Couldn't create nmi save area cache"); + } } void __init smp_prepare_boot_cpu(void) diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 2659b5cfeddb..54fce7b065de 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -386,5 +386,5 @@ SYSCALL(sys_mlock2,compat_sys_mlock2) SYSCALL(sys_copy_file_range,compat_sys_copy_file_range) /* 375 */ SYSCALL(sys_preadv2,compat_sys_preadv2) SYSCALL(sys_pwritev2,compat_sys_pwritev2) -NI_SYSCALL +SYSCALL(sys_s390_guarded_storage,compat_sys_s390_guarded_storage) /* 378 */ SYSCALL(sys_statx,compat_sys_statx) diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 12b6b138e354..eefcb54872a5 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -4,6 +4,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com>, */ +#include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/proc_fs.h> @@ -13,6 +14,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <asm/ebcdic.h> +#include <asm/debug.h> #include <asm/sysinfo.h> #include <asm/cpcmd.h> #include <asm/topology.h> @@ -485,3 +487,99 @@ void calibrate_delay(void) "%lu.%02lu BogoMIPS preset\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100); } + +#ifdef CONFIG_DEBUG_FS + +#define STSI_FILE(fc, s1, s2) \ +static int stsi_open_##fc##_##s1##_##s2(struct inode *inode, struct file *file)\ +{ \ + file->private_data = (void *) get_zeroed_page(GFP_KERNEL); \ + if (!file->private_data) \ + return -ENOMEM; \ + if (stsi(file->private_data, fc, s1, s2)) { \ + free_page((unsigned long)file->private_data); \ + file->private_data = NULL; \ + return -EACCES; \ + } \ + return nonseekable_open(inode, file); \ +} \ + \ +static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = { \ + .open = stsi_open_##fc##_##s1##_##s2, \ + .release = stsi_release, \ + .read = stsi_read, \ + .llseek = no_llseek, \ +}; + +static int stsi_release(struct inode *inode, struct file *file) +{ + free_page((unsigned long)file->private_data); + return 0; +} + +static ssize_t stsi_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) +{ + return simple_read_from_buffer(buf, size, ppos, file->private_data, PAGE_SIZE); +} + +STSI_FILE( 1, 1, 1); +STSI_FILE( 1, 2, 1); +STSI_FILE( 1, 2, 2); +STSI_FILE( 2, 2, 1); +STSI_FILE( 2, 2, 2); +STSI_FILE( 3, 2, 2); +STSI_FILE(15, 1, 2); +STSI_FILE(15, 1, 3); +STSI_FILE(15, 1, 4); +STSI_FILE(15, 1, 5); +STSI_FILE(15, 1, 6); + +struct stsi_file { + const struct file_operations *fops; + char *name; +}; + +static struct stsi_file stsi_file[] __initdata = { + {.fops = &stsi_1_1_1_fs_ops, .name = "1_1_1"}, + {.fops = &stsi_1_2_1_fs_ops, .name = "1_2_1"}, + {.fops = &stsi_1_2_2_fs_ops, .name = "1_2_2"}, + {.fops = &stsi_2_2_1_fs_ops, .name = "2_2_1"}, + {.fops = &stsi_2_2_2_fs_ops, .name = "2_2_2"}, + {.fops = &stsi_3_2_2_fs_ops, .name = "3_2_2"}, + {.fops = &stsi_15_1_2_fs_ops, .name = "15_1_2"}, + {.fops = &stsi_15_1_3_fs_ops, .name = "15_1_3"}, + {.fops = &stsi_15_1_4_fs_ops, .name = "15_1_4"}, + {.fops = &stsi_15_1_5_fs_ops, .name = "15_1_5"}, + {.fops = &stsi_15_1_6_fs_ops, .name = "15_1_6"}, +}; + +static u8 stsi_0_0_0; + +static __init int stsi_init_debugfs(void) +{ + struct dentry *stsi_root; + struct stsi_file *sf; + int lvl, i; + + stsi_root = debugfs_create_dir("stsi", arch_debugfs_dir); + if (IS_ERR_OR_NULL(stsi_root)) + return 0; + lvl = stsi(NULL, 0, 0, 0); + if (lvl > 0) + stsi_0_0_0 = lvl; + debugfs_create_u8("0_0_0", 0400, stsi_root, &stsi_0_0_0); + for (i = 0; i < ARRAY_SIZE(stsi_file); i++) { + sf = &stsi_file[i]; + debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops); + } + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && MACHINE_HAS_TOPOLOGY) { + char link_to[10]; + + sprintf(link_to, "15_1_%d", topology_mnest_limit()); + debugfs_create_symlink("topology", stsi_root, link_to); + } + return 0; +} +device_initcall(stsi_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 17660e800e74..bb47c92476f0 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -83,6 +83,8 @@ static cpumask_t cpu_thread_map(unsigned int cpu) return mask; } +#define TOPOLOGY_CORE_BITS 64 + static void add_cpus_to_mask(struct topology_core *tl_core, struct mask_info *drawer, struct mask_info *book, @@ -91,7 +93,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, struct cpu_topology_s390 *topo; unsigned int core; - for_each_set_bit(core, &tl_core->mask[0], TOPOLOGY_CORE_BITS) { + for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) { unsigned int rcore; int lcpu, i; @@ -244,7 +246,7 @@ static void update_cpu_masks(void) void store_topology(struct sysinfo_15_1_x *info) { - stsi(info, 15, 1, min(topology_max_mnest, 4)); + stsi(info, 15, 1, topology_mnest_limit()); } static int __arch_update_cpu_topology(void) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 0f8f14199734..169558dc7daf 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -420,8 +420,8 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, save_access_regs(vcpu->run->s.regs.acrs); /* Extended save area */ - rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr, - sizeof(unsigned long)); + rc = read_guest_lc(vcpu, __LC_MCESAD, &ext_sa_addr, + sizeof(unsigned long)); /* Only bits 0-53 are used for address formation */ ext_sa_addr &= ~0x3ffUL; if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fd6cd05bb6a7..d5c5c911821a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -273,7 +273,7 @@ static void kvm_s390_cpu_feat_init(void) kvm_s390_available_subfunc.pcc); } if (test_facility(57)) /* MSA5 */ - __cpacf_query(CPACF_PPNO, (cpacf_mask_t *) + __cpacf_query(CPACF_PRNO, (cpacf_mask_t *) kvm_s390_available_subfunc.ppno); if (MACHINE_HAS_ESOP) @@ -1512,9 +1512,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; } else { if (sclp.hamax == U64_MAX) - kvm->arch.mem_limit = TASK_MAX_SIZE; + kvm->arch.mem_limit = TASK_SIZE_MAX; else - kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE, + kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX, sclp.hamax + 1); kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); if (!kvm->arch.gmap) diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ba427eb6f14c..ffb15bd4c593 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -17,7 +17,7 @@ int spin_retry = -1; static int __init spin_retry_init(void) { if (spin_retry < 0) - spin_retry = MACHINE_HAS_CAD ? 10 : 1000; + spin_retry = 1000; return 0; } early_initcall(spin_retry_init); @@ -32,23 +32,17 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); -static inline void _raw_compare_and_delay(unsigned int *lock, unsigned int old) -{ - asm(".insn rsy,0xeb0000000022,%0,0,%1" : : "d" (old), "Q" (*lock)); -} - void arch_spin_lock_wait(arch_spinlock_t *lp) { - unsigned int cpu = SPINLOCK_LOCKVAL; - unsigned int owner; - int count, first_diag; + int cpu = SPINLOCK_LOCKVAL; + int owner, count, first_diag; first_diag = 1; while (1) { owner = ACCESS_ONCE(lp->lock); /* Try to get the lock if it is free. */ if (!owner) { - if (_raw_compare_and_swap(&lp->lock, 0, cpu)) + if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) return; continue; } @@ -61,8 +55,6 @@ void arch_spin_lock_wait(arch_spinlock_t *lp) /* Loop for a while on the lock value. */ count = spin_retry; do { - if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&lp->lock, owner); owner = ACCESS_ONCE(lp->lock); } while (owner && count-- > 0); if (!owner) @@ -82,9 +74,8 @@ EXPORT_SYMBOL(arch_spin_lock_wait); void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) { - unsigned int cpu = SPINLOCK_LOCKVAL; - unsigned int owner; - int count, first_diag; + int cpu = SPINLOCK_LOCKVAL; + int owner, count, first_diag; local_irq_restore(flags); first_diag = 1; @@ -93,7 +84,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) /* Try to get the lock if it is free. */ if (!owner) { local_irq_disable(); - if (_raw_compare_and_swap(&lp->lock, 0, cpu)) + if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) return; local_irq_restore(flags); continue; @@ -107,8 +98,6 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) /* Loop for a while on the lock value. */ count = spin_retry; do { - if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&lp->lock, owner); owner = ACCESS_ONCE(lp->lock); } while (owner && count-- > 0); if (!owner) @@ -128,18 +117,16 @@ EXPORT_SYMBOL(arch_spin_lock_wait_flags); int arch_spin_trylock_retry(arch_spinlock_t *lp) { - unsigned int cpu = SPINLOCK_LOCKVAL; - unsigned int owner; - int count; + int cpu = SPINLOCK_LOCKVAL; + int owner, count; for (count = spin_retry; count > 0; count--) { owner = READ_ONCE(lp->lock); /* Try to get the lock if it is free. */ if (!owner) { - if (_raw_compare_and_swap(&lp->lock, 0, cpu)) + if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) return 1; - } else if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&lp->lock, owner); + } } return 0; } @@ -147,8 +134,8 @@ EXPORT_SYMBOL(arch_spin_trylock_retry); void _raw_read_lock_wait(arch_rwlock_t *rw) { - unsigned int owner, old; int count = spin_retry; + int owner, old; #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES __RAW_LOCK(&rw->lock, -1, __RAW_OP_ADD); @@ -162,12 +149,9 @@ void _raw_read_lock_wait(arch_rwlock_t *rw) } old = ACCESS_ONCE(rw->lock); owner = ACCESS_ONCE(rw->owner); - if ((int) old < 0) { - if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&rw->lock, old); + if (old < 0) continue; - } - if (_raw_compare_and_swap(&rw->lock, old, old + 1)) + if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) return; } } @@ -175,17 +159,14 @@ EXPORT_SYMBOL(_raw_read_lock_wait); int _raw_read_trylock_retry(arch_rwlock_t *rw) { - unsigned int old; int count = spin_retry; + int old; while (count-- > 0) { old = ACCESS_ONCE(rw->lock); - if ((int) old < 0) { - if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&rw->lock, old); + if (old < 0) continue; - } - if (_raw_compare_and_swap(&rw->lock, old, old + 1)) + if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1)) return 1; } return 0; @@ -194,10 +175,10 @@ EXPORT_SYMBOL(_raw_read_trylock_retry); #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES -void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int prev) +void _raw_write_lock_wait(arch_rwlock_t *rw, int prev) { - unsigned int owner, old; int count = spin_retry; + int owner, old; owner = 0; while (1) { @@ -209,14 +190,12 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int prev) old = ACCESS_ONCE(rw->lock); owner = ACCESS_ONCE(rw->owner); smp_mb(); - if ((int) old >= 0) { + if (old >= 0) { prev = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR); old = prev; } - if ((old & 0x7fffffff) == 0 && (int) prev >= 0) + if ((old & 0x7fffffff) == 0 && prev >= 0) break; - if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&rw->lock, old); } } EXPORT_SYMBOL(_raw_write_lock_wait); @@ -225,8 +204,8 @@ EXPORT_SYMBOL(_raw_write_lock_wait); void _raw_write_lock_wait(arch_rwlock_t *rw) { - unsigned int owner, old, prev; int count = spin_retry; + int owner, old, prev; prev = 0x80000000; owner = 0; @@ -238,15 +217,13 @@ void _raw_write_lock_wait(arch_rwlock_t *rw) } old = ACCESS_ONCE(rw->lock); owner = ACCESS_ONCE(rw->owner); - if ((int) old >= 0 && - _raw_compare_and_swap(&rw->lock, old, old | 0x80000000)) + if (old >= 0 && + __atomic_cmpxchg_bool(&rw->lock, old, old | 0x80000000)) prev = old; else smp_mb(); - if ((old & 0x7fffffff) == 0 && (int) prev >= 0) + if ((old & 0x7fffffff) == 0 && prev >= 0) break; - if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&rw->lock, old); } } EXPORT_SYMBOL(_raw_write_lock_wait); @@ -255,24 +232,21 @@ EXPORT_SYMBOL(_raw_write_lock_wait); int _raw_write_trylock_retry(arch_rwlock_t *rw) { - unsigned int old; int count = spin_retry; + int old; while (count-- > 0) { old = ACCESS_ONCE(rw->lock); - if (old) { - if (MACHINE_HAS_CAD) - _raw_compare_and_delay(&rw->lock, old); + if (old) continue; - } - if (_raw_compare_and_swap(&rw->lock, 0, 0x80000000)) + if (__atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000)) return 1; } return 0; } EXPORT_SYMBOL(_raw_write_trylock_retry); -void arch_lock_relax(unsigned int cpu) +void arch_lock_relax(int cpu) { if (!cpu) return; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a07b1ec1391d..7f6db1e6c048 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -431,7 +431,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || from + len < from || to + len < to || - from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end) + from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) return -EINVAL; flush = 0; @@ -2004,20 +2004,12 @@ EXPORT_SYMBOL_GPL(gmap_shadow_page); * Called with sg->parent->shadow_lock. */ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, - unsigned long offset, pte_t *pte) + unsigned long gaddr, pte_t *pte) { struct gmap_rmap *rmap, *rnext, *head; - unsigned long gaddr, start, end, bits, raddr; - unsigned long *table; + unsigned long start, end, bits, raddr; BUG_ON(!gmap_is_shadow(sg)); - spin_lock(&sg->parent->guest_table_lock); - table = radix_tree_lookup(&sg->parent->host_to_guest, - vmaddr >> PMD_SHIFT); - gaddr = table ? __gmap_segment_gaddr(table) + offset : 0; - spin_unlock(&sg->parent->guest_table_lock); - if (!table) - return; spin_lock(&sg->guest_table_lock); if (sg->removed) { @@ -2076,7 +2068,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte, unsigned long bits) { - unsigned long offset, gaddr; + unsigned long offset, gaddr = 0; unsigned long *table; struct gmap *gmap, *sg, *next; @@ -2084,22 +2076,23 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, offset = offset * (4096 / sizeof(pte_t)); rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { - spin_lock(&gmap->shadow_lock); - list_for_each_entry_safe(sg, next, - &gmap->children, list) - gmap_shadow_notify(sg, vmaddr, offset, pte); - spin_unlock(&gmap->shadow_lock); - } - if (!(bits & PGSTE_IN_BIT)) - continue; spin_lock(&gmap->guest_table_lock); table = radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); if (table) gaddr = __gmap_segment_gaddr(table) + offset; spin_unlock(&gmap->guest_table_lock); - if (table) + if (!table) + continue; + + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, gaddr, pte); + spin_unlock(&gmap->shadow_lock); + } + if (bits & PGSTE_IN_BIT) gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); } rcu_read_unlock(); diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 18d4107e10ee..b7b779c40a5b 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -211,7 +211,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if ((end <= start) || (end > TASK_SIZE)) + if ((end <= start) || (end > mm->context.asce_limit)) return 0; /* * local_irq_save() doesn't prevent pagetable teardown, but does diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 50618614881f..b017daed6887 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -89,19 +89,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct vm_unmapped_area_info info; + int rc; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) - return addr; + goto check_asce_limit; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) - return addr; + goto check_asce_limit; } info.flags = 0; @@ -113,7 +114,18 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, else info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; - return vm_unmapped_area(&info); + addr = vm_unmapped_area(&info); + if (addr & ~PAGE_MASK) + return addr; + +check_asce_limit: + if (addr + len > current->mm->context.asce_limit) { + rc = crst_table_upgrade(mm); + if (rc) + return (unsigned long) rc; + } + + return addr; } unsigned long @@ -125,13 +137,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, struct mm_struct *mm = current->mm; unsigned long addr = addr0; struct vm_unmapped_area_info info; + int rc; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) - return addr; + goto check_asce_limit; /* requesting a specific address */ if (addr) { @@ -139,7 +152,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) - return addr; + goto check_asce_limit; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; @@ -165,65 +178,20 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE; addr = vm_unmapped_area(&info); + if (addr & ~PAGE_MASK) + return addr; } - return addr; -} - -int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) -{ - if (is_compat_task() || TASK_SIZE >= TASK_MAX_SIZE) - return 0; - if (!(flags & MAP_FIXED)) - addr = 0; - if ((addr + len) >= TASK_SIZE) - return crst_table_upgrade(current->mm); - return 0; -} - -static unsigned long -s390_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - unsigned long area; - int rc; - - area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); - if (!(area & ~PAGE_MASK)) - return area; - if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) { - /* Upgrade the page table to 4 levels and retry. */ +check_asce_limit: + if (addr + len > current->mm->context.asce_limit) { rc = crst_table_upgrade(mm); if (rc) return (unsigned long) rc; - area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); } - return area; -} - -static unsigned long -s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) -{ - struct mm_struct *mm = current->mm; - unsigned long area; - int rc; - area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); - if (!(area & ~PAGE_MASK)) - return area; - if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) { - /* Upgrade the page table to 4 levels and retry. */ - rc = crst_table_upgrade(mm); - if (rc) - return (unsigned long) rc; - area = arch_get_unmapped_area_topdown(filp, addr, len, - pgoff, flags); - } - return area; + return addr; } + /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -241,9 +209,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm) */ if (mmap_is_legacy()) { mm->mmap_base = mmap_base_legacy(random_factor); - mm->get_unmapped_area = s390_get_unmapped_area; + mm->get_unmapped_area = arch_get_unmapped_area; } else { mm->mmap_base = mmap_base(random_factor); - mm->get_unmapped_area = s390_get_unmapped_area_topdown; + mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 3330ea124eec..69a7b01ae746 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -13,8 +13,7 @@ #include <linux/gfp.h> #include <linux/init.h> -#define ESSA_SET_STABLE 1 -#define ESSA_SET_UNUSED 2 +#include <asm/page-states.h> static int cmma_flag = 1; diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index fc5dc33bb141..fc321c5ec30e 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -94,7 +94,7 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) new = pte_mkwrite(pte_mkdirty(new)); - if ((flags & SET_MEMORY_NX) && MACHINE_HAS_NX) + if (flags & SET_MEMORY_NX) pte_val(new) |= _PAGE_NOEXEC; else if (flags & SET_MEMORY_X) pte_val(new) &= ~_PAGE_NOEXEC; @@ -144,7 +144,7 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) new = pmd_mkwrite(pmd_mkdirty(new)); - if ((flags & SET_MEMORY_NX) && MACHINE_HAS_NX) + if (flags & SET_MEMORY_NX) pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC; else if (flags & SET_MEMORY_X) pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC; @@ -221,7 +221,7 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, new = pud_wrprotect(new); else if (flags & SET_MEMORY_RW) new = pud_mkwrite(pud_mkdirty(new)); - if ((flags & SET_MEMORY_NX) && MACHINE_HAS_NX) + if (flags & SET_MEMORY_NX) pud_val(new) |= _REGION_ENTRY_NOEXEC; else if (flags & SET_MEMORY_X) pud_val(new) &= ~_REGION_ENTRY_NOEXEC; @@ -288,6 +288,10 @@ static int change_page_attr(unsigned long addr, unsigned long end, int __set_memory(unsigned long addr, int numpages, unsigned long flags) { + if (!MACHINE_HAS_NX) + flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); + if (!flags) + return 0; addr &= PAGE_MASK; return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 995f78532cc2..f502cbe657af 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -95,7 +95,6 @@ int crst_table_upgrade(struct mm_struct *mm) mm->context.asce_limit = 1UL << 53; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; - mm->task_size = mm->context.asce_limit; spin_unlock_bh(&mm->page_table_lock); on_each_cpu(__crst_table_upgrade, mm, 0); @@ -119,7 +118,6 @@ void crst_table_downgrade(struct mm_struct *mm) mm->context.asce_limit = 1UL << 31; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - mm->task_size = mm->context.asce_limit; crst_table_free(mm, (unsigned long *) pgd); if (current->active_mm == mm) @@ -144,7 +142,7 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm) struct page *page; unsigned long *table; - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + page = alloc_page(GFP_KERNEL); if (page) { table = (unsigned long *) page_to_phys(page); clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 463e5ef02304..947b66a5cdba 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -23,6 +23,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> static inline pte_t ptep_flush_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -787,4 +788,156 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, return 0; } EXPORT_SYMBOL(get_guest_storage_key); + +/** + * pgste_perform_essa - perform ESSA actions on the PGSTE. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @orc: the specific action to perform, see the ESSA_SET_* macros. + * @oldpte: the PTE will be saved there if the pointer is not NULL. + * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. + * + * Return: 1 if the page is to be added to the CBRL, otherwise 0, + * or < 0 in case of error. -EINVAL is returned for invalid values + * of orc, -EFAULT for invalid addresses. + */ +int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste) +{ + unsigned long pgstev; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + int res = 0; + + WARN_ON_ONCE(orc > ESSA_MAX); + if (unlikely(orc > ESSA_MAX)) + return -EINVAL; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + if (oldpte) + *oldpte = pte_val(*ptep); + if (oldpgste) + *oldpgste = pgstev; + + switch (orc) { + case ESSA_GET_STATE: + break; + case ESSA_SET_STABLE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_UNUSED: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_UNUSED; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; + break; + } + if (pgstev & _PGSTE_GPS_ZERO) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + break; + } + if (!(pgstev & PGSTE_GC_BIT)) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + res = 1; + break; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. + */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + } + break; + default: + /* we should never get here! */ + break; + } + /* If we are discarding a page, set it to logical zero */ + if (res) + pgstev |= _PGSTE_GPS_ZERO; + + pgste_val(pgste) = pgstev; + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return res; +} +EXPORT_SYMBOL(pgste_perform_essa); + +/** + * set_pgste_bits - set specific PGSTE bits. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @bits: a bitmask representing the bits that will be touched + * @value: the values of the bits to be written. Only the bits in the mask + * will be written. + * + * Return: 0 on success, < 0 in case of error. + */ +int set_pgste_bits(struct mm_struct *mm, unsigned long hva, + unsigned long bits, unsigned long value) +{ + spinlock_t *ptl; + pgste_t new; + pte_t *ptep; + + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + new = pgste_get_lock(ptep); + + pgste_val(new) &= ~bits; + pgste_val(new) |= value & bits; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(set_pgste_bits); + +/** + * get_pgste - get the current PGSTE for the given address. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @pgstep: will be written with the current PGSTE for the given address. + * + * Return: 0 on success, < 0 in case of error. + */ +int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) +{ + spinlock_t *ptl; + pte_t *ptep; + + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + *pgstep = pgste_val(pgste_get(ptep)); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(get_pgste); #endif diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 364b9d824be3..8051df109db3 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -60,16 +60,8 @@ static DEFINE_SPINLOCK(zpci_domain_lock); static struct airq_iv *zpci_aisb_iv; static struct airq_iv *zpci_aibv[ZPCI_NR_DEVICES]; -/* Adapter interrupt definitions */ -static void zpci_irq_handler(struct airq_struct *airq); - -static struct airq_struct zpci_airq = { - .handler = zpci_irq_handler, - .isc = PCI_ISC, -}; - #define ZPCI_IOMAP_ENTRIES \ - min(((unsigned long) CONFIG_PCI_NR_FUNCTIONS * PCI_BAR_COUNT), \ + min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2), \ ZPCI_IOMAP_MAX_ENTRIES) static DEFINE_SPINLOCK(zpci_iomap_lock); @@ -214,8 +206,6 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) return rc; } -#define ZPCI_PCIAS_CFGSPC 15 - static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) { u64 req = ZPCI_CREATE_REQ(zdev->fh, ZPCI_PCIAS_CFGSPC, len); @@ -507,6 +497,11 @@ static void zpci_unmap_resources(struct pci_dev *pdev) } } +static struct airq_struct zpci_airq = { + .handler = zpci_irq_handler, + .isc = PCI_ISC, +}; + static int __init zpci_irq_init(void) { int rc; @@ -871,11 +866,6 @@ int zpci_report_error(struct pci_dev *pdev, } EXPORT_SYMBOL(zpci_report_error); -static inline int barsize(u8 size) -{ - return (size) ? (1 << size) >> 10 : 0; -} - static int zpci_mem_init(void) { BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) || diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 0cafe08919c9..b9918fb9587d 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -423,6 +423,20 @@ config HW_RANDOM_CAVIUM If unsure, say Y. +config HW_RANDOM_S390 + tristate "S390 True Random Number Generator support" + depends on S390 + default HW_RANDOM + ---help--- + This driver provides kernel-side support for the True + Random Number Generator available as CPACF extension + on modern s390 hardware platforms. + + To compile this driver as a module, choose M here: the + module will be called s390-trng. + + If unsure, say Y. + endif # HW_RANDOM config UML_RANDOM diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile index 5f52b1e4e7be..dd1765246255 100644 --- a/drivers/char/hw_random/Makefile +++ b/drivers/char/hw_random/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_HW_RANDOM_STM32) += stm32-rng.o obj-$(CONFIG_HW_RANDOM_PIC32) += pic32-rng.o obj-$(CONFIG_HW_RANDOM_MESON) += meson-rng.o obj-$(CONFIG_HW_RANDOM_CAVIUM) += cavium-rng.o cavium-rng-vf.o +obj-$(CONFIG_HW_RANDOM_S390) += s390-trng.o diff --git a/drivers/char/hw_random/s390-trng.c b/drivers/char/hw_random/s390-trng.c new file mode 100644 index 000000000000..aca48e893fca --- /dev/null +++ b/drivers/char/hw_random/s390-trng.c @@ -0,0 +1,268 @@ +/* + * s390 TRNG device driver + * + * Driver for the TRNG (true random number generation) command + * available via CPACF extension MSA 7 on the s390 arch. + + * Copyright IBM Corp. 2017 + * Author(s): Harald Freudenberger <freude@de.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + */ + +#define KMSG_COMPONENT "trng" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/hw_random.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/miscdevice.h> +#include <linux/debugfs.h> +#include <linux/atomic.h> +#include <linux/random.h> +#include <linux/sched/signal.h> +#include <asm/debug.h> +#include <asm/cpacf.h> + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("IBM Corporation"); +MODULE_DESCRIPTION("s390 CPACF TRNG device driver"); + + +/* trng related debug feature things */ + +static debug_info_t *debug_info; + +#define DEBUG_DBG(...) debug_sprintf_event(debug_info, 6, ##__VA_ARGS__) +#define DEBUG_INFO(...) debug_sprintf_event(debug_info, 5, ##__VA_ARGS__) +#define DEBUG_WARN(...) debug_sprintf_event(debug_info, 4, ##__VA_ARGS__) +#define DEBUG_ERR(...) debug_sprintf_event(debug_info, 3, ##__VA_ARGS__) + + +/* trng helpers */ + +static atomic64_t trng_dev_counter = ATOMIC64_INIT(0); +static atomic64_t trng_hwrng_counter = ATOMIC64_INIT(0); + + +/* file io functions */ + +static int trng_open(struct inode *inode, struct file *file) +{ + return nonseekable_open(inode, file); +} + +static ssize_t trng_read(struct file *file, char __user *ubuf, + size_t nbytes, loff_t *ppos) +{ + u8 buf[32]; + u8 *p = buf; + unsigned int n; + ssize_t ret = 0; + + /* + * use buf for requests <= sizeof(buf), + * otherwise allocate one page and fetch + * pagewise. + */ + + if (nbytes > sizeof(buf)) { + p = (u8 *) __get_free_page(GFP_KERNEL); + if (!p) + return -ENOMEM; + } + + while (nbytes) { + if (need_resched()) { + if (signal_pending(current)) { + if (ret == 0) + ret = -ERESTARTSYS; + break; + } + schedule(); + } + n = nbytes > PAGE_SIZE ? PAGE_SIZE : nbytes; + cpacf_trng(NULL, 0, p, n); + atomic64_add(n, &trng_dev_counter); + if (copy_to_user(ubuf, p, n)) { + ret = -EFAULT; + break; + } + nbytes -= n; + ubuf += n; + ret += n; + } + + if (p != buf) + free_page((unsigned long) p); + + DEBUG_DBG("trng_read()=%zd\n", ret); + return ret; +} + + +/* sysfs */ + +static ssize_t trng_counter_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 dev_counter = atomic64_read(&trng_dev_counter); + u64 hwrng_counter = atomic64_read(&trng_hwrng_counter); +#if IS_ENABLED(CONFIG_ARCH_RANDOM) + u64 arch_counter = atomic64_read(&s390_arch_random_counter); + + return snprintf(buf, PAGE_SIZE, + "trng: %llu\n" + "hwrng: %llu\n" + "arch: %llu\n" + "total: %llu\n", + dev_counter, hwrng_counter, arch_counter, + dev_counter + hwrng_counter + arch_counter); +#else + return snprintf(buf, PAGE_SIZE, + "trng: %llu\n" + "hwrng: %llu\n" + "total: %llu\n", + dev_counter, hwrng_counter, + dev_counter + hwrng_counter); +#endif +} +static DEVICE_ATTR(byte_counter, 0444, trng_counter_show, NULL); + +static struct attribute *trng_dev_attrs[] = { + &dev_attr_byte_counter.attr, + NULL +}; + +static const struct attribute_group trng_dev_attr_group = { + .attrs = trng_dev_attrs +}; + +static const struct attribute_group *trng_dev_attr_groups[] = { + &trng_dev_attr_group, + NULL +}; + +static const struct file_operations trng_fops = { + .owner = THIS_MODULE, + .open = &trng_open, + .release = NULL, + .read = &trng_read, + .llseek = noop_llseek, +}; + +static struct miscdevice trng_dev = { + .name = "trng", + .minor = MISC_DYNAMIC_MINOR, + .mode = 0444, + .fops = &trng_fops, + .groups = trng_dev_attr_groups, +}; + + +/* hwrng_register */ + +static inline void _trng_hwrng_read(u8 *buf, size_t len) +{ + cpacf_trng(NULL, 0, buf, len); + atomic64_add(len, &trng_hwrng_counter); +} + +static int trng_hwrng_data_read(struct hwrng *rng, u32 *data) +{ + size_t len = sizeof(*data); + + _trng_hwrng_read((u8 *) data, len); + + DEBUG_DBG("trng_hwrng_data_read()=%zu\n", len); + + return len; +} + +static int trng_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) +{ + size_t len = max <= PAGE_SIZE ? max : PAGE_SIZE; + + _trng_hwrng_read((u8 *) data, len); + + DEBUG_DBG("trng_hwrng_read()=%zu\n", len); + + return len; +} + +/* + * hwrng register struct + * The trng is suppost to have 100% entropy, and thus + * we register with a very high quality value. + */ +static struct hwrng trng_hwrng_dev = { + .name = "s390-trng", + .data_read = trng_hwrng_data_read, + .read = trng_hwrng_read, + .quality = 999, +}; + + +/* init and exit */ + +static void __init trng_debug_init(void) +{ + debug_info = debug_register("trng", 1, 1, 4 * sizeof(long)); + debug_register_view(debug_info, &debug_sprintf_view); + debug_set_level(debug_info, 3); +} + +static void trng_debug_exit(void) +{ + debug_unregister(debug_info); +} + +static int __init trng_init(void) +{ + int ret; + + trng_debug_init(); + + /* check if subfunction CPACF_PRNO_TRNG is available */ + if (!cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) { + DEBUG_INFO("trng_init CPACF_PRNO_TRNG not available\n"); + ret = -ENODEV; + goto out_dbg; + } + + ret = misc_register(&trng_dev); + if (ret) { + DEBUG_WARN("trng_init misc_register() failed rc=%d\n", ret); + goto out_dbg; + } + + ret = hwrng_register(&trng_hwrng_dev); + if (ret) { + DEBUG_WARN("trng_init hwrng_register() failed rc=%d\n", ret); + goto out_misc; + } + + DEBUG_DBG("trng_init successful\n"); + + return 0; + +out_misc: + misc_deregister(&trng_dev); +out_dbg: + trng_debug_exit(); + return ret; +} + +static void __exit trng_exit(void) +{ + hwrng_unregister(&trng_hwrng_dev); + misc_deregister(&trng_dev); + trng_debug_exit(); +} + +module_cpu_feature_match(MSA, trng_init); +module_exit(trng_exit); diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 37e204f3d9be..6ee3a25ae731 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -327,6 +327,14 @@ config S390_IOMMU help Support for the IOMMU API for s390 PCI devices. +config S390_CCW_IOMMU + bool "S390 CCW IOMMU Support" + depends on S390 && CCW + select IOMMU_API + help + Enables bits of IOMMU API required by VFIO. The iommu_ops + is not implemented as it is not necessary for VFIO. + config MTK_IOMMU bool "MTK IOMMU Support" depends on ARM || ARM64 diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index 774da20ceb58..107cd3361e29 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -1052,8 +1052,9 @@ dasd_3990_erp_com_rej(struct dasd_ccw_req * erp, char *sense) } else { /* fatal error - set status to FAILED internal error 09 - Command Reject */ - dev_err(&device->cdev->dev, "An error occurred in the DASD " - "device driver, reason=%s\n", "09"); + if (!test_bit(DASD_CQR_SUPPRESS_CR, &erp->flags)) + dev_err(&device->cdev->dev, + "An error occurred in the DASD device driver, reason=09\n"); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); } diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 0b38217f8147..122456e4db89 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -4927,10 +4927,14 @@ static void dasd_eckd_dump_sense(struct dasd_device *device, dasd_eckd_dump_sense_tcw(device, req, irb); } else { /* - * In some cases the 'No Record Found' error might be expected - * and log messages shouldn't be written then. Check if the - * according suppress bit is set. + * In some cases the 'Command Reject' or 'No Record Found' + * error might be expected and log messages shouldn't be + * written then. Check if the according suppress bit is set. */ + if (sense && sense[0] & SNS0_CMD_REJECT && + test_bit(DASD_CQR_SUPPRESS_CR, &req->flags)) + return; + if (sense && sense[1] & SNS1_NO_REC_FOUND && test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags)) return; @@ -5172,6 +5176,10 @@ static int dasd_eckd_query_host_access(struct dasd_device *device, if (!device->block && private->lcu->pav == HYPER_PAV) return -EOPNOTSUPP; + /* may not be supported by the storage server */ + if (!(private->features.feature[14] & 0x80)) + return -EOPNOTSUPP; + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, sizeof(struct dasd_psf_prssd_data) + 1, device); @@ -5219,6 +5227,8 @@ static int dasd_eckd_query_host_access(struct dasd_device *device, cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; + /* the command might not be supported, suppress error message */ + __set_bit(DASD_CQR_SUPPRESS_CR, &cqr->flags); rc = dasd_sleep_on_interruptible(cqr); if (rc == 0) { *data = *host_access; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 518dba2732d5..dca7cb1e6f65 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -239,11 +239,11 @@ struct dasd_ccw_req { */ /* * The following flags are used to suppress output of certain errors. - * These flags should only be used for format checks! */ #define DASD_CQR_SUPPRESS_NRF 4 /* Suppress 'No Record Found' error */ #define DASD_CQR_SUPPRESS_FP 5 /* Suppress 'File Protected' error*/ #define DASD_CQR_SUPPRESS_IL 6 /* Suppress 'Incorrect Length' error */ +#define DASD_CQR_SUPPRESS_CR 7 /* Suppress 'Command Reject' error */ /* Signature for error recovery functions. */ typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *); diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile index 3ab9aedeb84a..bdf47526038a 100644 --- a/drivers/s390/cio/Makefile +++ b/drivers/s390/cio/Makefile @@ -17,3 +17,6 @@ obj-$(CONFIG_CCWGROUP) += ccwgroup.o qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o obj-$(CONFIG_QDIO) += qdio.o + +vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o vfio_ccw_fsm.o +obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 1b350665c823..89216174fcbb 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -170,12 +170,14 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */ return ccode; } } +EXPORT_SYMBOL_GPL(cio_start_key); int cio_start (struct subchannel *sch, struct ccw1 *cpa, __u8 lpm) { return cio_start_key(sch, cpa, lpm, PAGE_DEFAULT_KEY); } +EXPORT_SYMBOL_GPL(cio_start); /* * resume suspended I/O operation @@ -208,6 +210,7 @@ cio_resume (struct subchannel *sch) return -ENODEV; } } +EXPORT_SYMBOL_GPL(cio_resume); /* * halt I/O operation @@ -241,6 +244,7 @@ cio_halt(struct subchannel *sch) return -ENODEV; } } +EXPORT_SYMBOL_GPL(cio_halt); /* * Clear I/O operation @@ -271,6 +275,7 @@ cio_clear(struct subchannel *sch) return -ENODEV; } } +EXPORT_SYMBOL_GPL(cio_clear); /* * Function: cio_cancel @@ -308,7 +313,68 @@ cio_cancel (struct subchannel *sch) return -ENODEV; } } +EXPORT_SYMBOL_GPL(cio_cancel); +/** + * cio_cancel_halt_clear - Cancel running I/O by performing cancel, halt + * and clear ordinally if subchannel is valid. + * @sch: subchannel on which to perform the cancel_halt_clear operation + * @iretry: the number of the times remained to retry the next operation + * + * This should be called repeatedly since halt/clear are asynchronous + * operations. We do one try with cio_cancel, three tries with cio_halt, + * 255 tries with cio_clear. The caller should initialize @iretry with + * the value 255 for its first call to this, and keep using the same + * @iretry in the subsequent calls until it gets a non -EBUSY return. + * + * Returns 0 if device now idle, -ENODEV for device not operational, + * -EBUSY if an interrupt is expected (either from halt/clear or from a + * status pending), and -EIO if out of retries. + */ +int cio_cancel_halt_clear(struct subchannel *sch, int *iretry) +{ + int ret; + + if (cio_update_schib(sch)) + return -ENODEV; + if (!sch->schib.pmcw.ena) + /* Not operational -> done. */ + return 0; + /* Stage 1: cancel io. */ + if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_HALT_PEND) && + !(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) { + if (!scsw_is_tm(&sch->schib.scsw)) { + ret = cio_cancel(sch); + if (ret != -EINVAL) + return ret; + } + /* + * Cancel io unsuccessful or not applicable (transport mode). + * Continue with asynchronous instructions. + */ + *iretry = 3; /* 3 halt retries. */ + } + /* Stage 2: halt io. */ + if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) { + if (*iretry) { + *iretry -= 1; + ret = cio_halt(sch); + if (ret != -EBUSY) + return (ret == 0) ? -EBUSY : ret; + } + /* Halt io unsuccessful. */ + *iretry = 255; /* 255 clear retries. */ + } + /* Stage 3: clear io. */ + if (*iretry) { + *iretry -= 1; + ret = cio_clear(sch); + return (ret == 0) ? -EBUSY : ret; + } + /* Function was unsuccessful */ + return -EIO; +} +EXPORT_SYMBOL_GPL(cio_cancel_halt_clear); static void cio_apply_config(struct subchannel *sch, struct schib *schib) { @@ -382,6 +448,7 @@ int cio_commit_config(struct subchannel *sch) } return ret; } +EXPORT_SYMBOL_GPL(cio_commit_config); /** * cio_update_schib - Perform stsch and update schib if subchannel is valid. @@ -987,6 +1054,7 @@ int cio_tm_start_key(struct subchannel *sch, struct tcw *tcw, u8 lpm, u8 key) return cio_start_handle_notoper(sch, lpm); } } +EXPORT_SYMBOL_GPL(cio_tm_start_key); /** * cio_tm_intrg - perform interrogate function @@ -1012,3 +1080,4 @@ int cio_tm_intrg(struct subchannel *sch) return -ENODEV; } } +EXPORT_SYMBOL_GPL(cio_tm_intrg); diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index f0e57aefb5f2..939596d81b73 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -123,6 +123,7 @@ extern int cio_enable_subchannel(struct subchannel *, u32); extern int cio_disable_subchannel (struct subchannel *); extern int cio_cancel (struct subchannel *); extern int cio_clear (struct subchannel *); +extern int cio_cancel_halt_clear(struct subchannel *, int *); extern int cio_resume (struct subchannel *); extern int cio_halt (struct subchannel *); extern int cio_start (struct subchannel *, struct ccw1 *, __u8); diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 9afb5ce13007..12016e32e519 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -124,14 +124,6 @@ ccw_device_set_timeout(struct ccw_device *cdev, int expires) add_timer(&cdev->private->timer); } -/* - * Cancel running i/o. This is called repeatedly since halt/clear are - * asynchronous operations. We do one try with cio_cancel, two tries - * with cio_halt, 255 tries with cio_clear. If everythings fails panic. - * Returns 0 if device now idle, -ENODEV for device not operational and - * -EBUSY if an interrupt is expected (either from halt/clear or from a - * status pending). - */ int ccw_device_cancel_halt_clear(struct ccw_device *cdev) { @@ -139,44 +131,14 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev) int ret; sch = to_subchannel(cdev->dev.parent); - if (cio_update_schib(sch)) - return -ENODEV; - if (!sch->schib.pmcw.ena) - /* Not operational -> done. */ - return 0; - /* Stage 1: cancel io. */ - if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_HALT_PEND) && - !(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) { - if (!scsw_is_tm(&sch->schib.scsw)) { - ret = cio_cancel(sch); - if (ret != -EINVAL) - return ret; - } - /* cancel io unsuccessful or not applicable (transport mode). - * Continue with asynchronous instructions. */ - cdev->private->iretry = 3; /* 3 halt retries. */ - } - if (!(scsw_actl(&sch->schib.scsw) & SCSW_ACTL_CLEAR_PEND)) { - /* Stage 2: halt io. */ - if (cdev->private->iretry) { - cdev->private->iretry--; - ret = cio_halt(sch); - if (ret != -EBUSY) - return (ret == 0) ? -EBUSY : ret; - } - /* halt io unsuccessful. */ - cdev->private->iretry = 255; /* 255 clear retries. */ - } - /* Stage 3: clear io. */ - if (cdev->private->iretry) { - cdev->private->iretry--; - ret = cio_clear (sch); - return (ret == 0) ? -EBUSY : ret; - } - /* Function was unsuccessful */ - CIO_MSG_EVENT(0, "0.%x.%04x: could not stop I/O\n", - cdev->private->dev_id.ssid, cdev->private->dev_id.devno); - return -EIO; + ret = cio_cancel_halt_clear(sch, &cdev->private->iretry); + + if (ret == -EIO) + CIO_MSG_EVENT(0, "0.%x.%04x: could not stop I/O\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno); + + return ret; } void ccw_device_update_sense_data(struct ccw_device *cdev) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c new file mode 100644 index 000000000000..ba6ac83a6c25 --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -0,0 +1,842 @@ +/* + * channel program interfaces + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com> + */ + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/vfio.h> +#include <asm/idals.h> + +#include "vfio_ccw_cp.h" + +/* + * Max length for ccw chain. + * XXX: Limit to 256, need to check more? + */ +#define CCWCHAIN_LEN_MAX 256 + +struct pfn_array { + unsigned long pa_iova; + unsigned long *pa_iova_pfn; + unsigned long *pa_pfn; + int pa_nr; +}; + +struct pfn_array_table { + struct pfn_array *pat_pa; + int pat_nr; +}; + +struct ccwchain { + struct list_head next; + struct ccw1 *ch_ccw; + /* Guest physical address of the current chain. */ + u64 ch_iova; + /* Count of the valid ccws in chain. */ + int ch_len; + /* Pinned PAGEs for the original data. */ + struct pfn_array_table *ch_pat; +}; + +/* + * pfn_array_pin() - pin user pages in memory + * @pa: pfn_array on which to perform the operation + * @mdev: the mediated device to perform pin/unpin operations + * + * Attempt to pin user pages in memory. + * + * Usage of pfn_array: + * @pa->pa_iova starting guest physical I/O address. Assigned by caller. + * @pa->pa_iova_pfn array that stores PFNs of the pages need to pin. Allocated + * by caller. + * @pa->pa_pfn array that receives PFNs of the pages pinned. Allocated by + * caller. + * @pa->pa_nr number of pages from @pa->pa_iova to pin. Assigned by + * caller. + * number of pages pinned. Assigned by callee. + * + * Returns: + * Number of pages pinned on success. + * If @pa->pa_nr is 0 or negative, returns 0. + * If no pages were pinned, returns -errno. + */ +static int pfn_array_pin(struct pfn_array *pa, struct device *mdev) +{ + int i, ret; + + if (pa->pa_nr <= 0) { + pa->pa_nr = 0; + return 0; + } + + pa->pa_iova_pfn[0] = pa->pa_iova >> PAGE_SHIFT; + for (i = 1; i < pa->pa_nr; i++) + pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1; + + ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr, + IOMMU_READ | IOMMU_WRITE, pa->pa_pfn); + + if (ret > 0 && ret != pa->pa_nr) { + vfio_unpin_pages(mdev, pa->pa_iova_pfn, ret); + pa->pa_nr = 0; + return 0; + } + + return ret; +} + +/* Unpin the pages before releasing the memory. */ +static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev) +{ + vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr); + pa->pa_nr = 0; + kfree(pa->pa_iova_pfn); +} + +/* Alloc memory for PFNs, then pin pages with them. */ +static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, + u64 iova, unsigned int len) +{ + int ret = 0; + + if (!len || pa->pa_nr) + return -EINVAL; + + pa->pa_iova = iova; + + pa->pa_nr = ((iova & ~PAGE_MASK) + len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + if (!pa->pa_nr) + return -EINVAL; + + pa->pa_iova_pfn = kcalloc(pa->pa_nr, + sizeof(*pa->pa_iova_pfn) + + sizeof(*pa->pa_pfn), + GFP_KERNEL); + if (unlikely(!pa->pa_iova_pfn)) + return -ENOMEM; + pa->pa_pfn = pa->pa_iova_pfn + pa->pa_nr; + + ret = pfn_array_pin(pa, mdev); + + if (ret > 0) + return ret; + else if (!ret) + ret = -EINVAL; + + kfree(pa->pa_iova_pfn); + + return ret; +} + +static int pfn_array_table_init(struct pfn_array_table *pat, int nr) +{ + pat->pat_pa = kcalloc(nr, sizeof(*pat->pat_pa), GFP_KERNEL); + if (unlikely(ZERO_OR_NULL_PTR(pat->pat_pa))) { + pat->pat_nr = 0; + return -ENOMEM; + } + + pat->pat_nr = nr; + + return 0; +} + +static void pfn_array_table_unpin_free(struct pfn_array_table *pat, + struct device *mdev) +{ + int i; + + for (i = 0; i < pat->pat_nr; i++) + pfn_array_unpin_free(pat->pat_pa + i, mdev); + + if (pat->pat_nr) { + kfree(pat->pat_pa); + pat->pat_pa = NULL; + pat->pat_nr = 0; + } +} + +static bool pfn_array_table_iova_pinned(struct pfn_array_table *pat, + unsigned long iova) +{ + struct pfn_array *pa = pat->pat_pa; + unsigned long iova_pfn = iova >> PAGE_SHIFT; + int i, j; + + for (i = 0; i < pat->pat_nr; i++, pa++) + for (j = 0; j < pa->pa_nr; j++) + if (pa->pa_iova_pfn[i] == iova_pfn) + return true; + + return false; +} +/* Create the list idal words for a pfn_array_table. */ +static inline void pfn_array_table_idal_create_words( + struct pfn_array_table *pat, + unsigned long *idaws) +{ + struct pfn_array *pa; + int i, j, k; + + /* + * Idal words (execept the first one) rely on the memory being 4k + * aligned. If a user virtual address is 4K aligned, then it's + * corresponding kernel physical address will also be 4K aligned. Thus + * there will be no problem here to simply use the phys to create an + * idaw. + */ + k = 0; + for (i = 0; i < pat->pat_nr; i++) { + pa = pat->pat_pa + i; + for (j = 0; j < pa->pa_nr; j++) { + idaws[k] = pa->pa_pfn[j] << PAGE_SHIFT; + if (k == 0) + idaws[k] += pa->pa_iova & (PAGE_SIZE - 1); + k++; + } + } +} + + +/* + * Within the domain (@mdev), copy @n bytes from a guest physical + * address (@iova) to a host physical address (@to). + */ +static long copy_from_iova(struct device *mdev, + void *to, u64 iova, + unsigned long n) +{ + struct pfn_array pa = {0}; + u64 from; + int i, ret; + unsigned long l, m; + + ret = pfn_array_alloc_pin(&pa, mdev, iova, n); + if (ret <= 0) + return ret; + + l = n; + for (i = 0; i < pa.pa_nr; i++) { + from = pa.pa_pfn[i] << PAGE_SHIFT; + m = PAGE_SIZE; + if (i == 0) { + from += iova & (PAGE_SIZE - 1); + m -= iova & (PAGE_SIZE - 1); + } + + m = min(l, m); + memcpy(to + (n - l), (void *)from, m); + + l -= m; + if (l == 0) + break; + } + + pfn_array_unpin_free(&pa, mdev); + + return l; +} + +static long copy_ccw_from_iova(struct channel_program *cp, + struct ccw1 *to, u64 iova, + unsigned long len) +{ + struct ccw0 ccw0; + struct ccw1 *pccw1; + int ret; + int i; + + ret = copy_from_iova(cp->mdev, to, iova, len * sizeof(struct ccw1)); + if (ret) + return ret; + + if (!cp->orb.cmd.fmt) { + pccw1 = to; + for (i = 0; i < len; i++) { + ccw0 = *(struct ccw0 *)pccw1; + if ((pccw1->cmd_code & 0x0f) == CCW_CMD_TIC) { + pccw1->cmd_code = CCW_CMD_TIC; + pccw1->flags = 0; + pccw1->count = 0; + } else { + pccw1->cmd_code = ccw0.cmd_code; + pccw1->flags = ccw0.flags; + pccw1->count = ccw0.count; + } + pccw1->cda = ccw0.cda; + pccw1++; + } + } + + return ret; +} + +/* + * Helpers to operate ccwchain. + */ +#define ccw_is_test(_ccw) (((_ccw)->cmd_code & 0x0F) == 0) + +#define ccw_is_noop(_ccw) ((_ccw)->cmd_code == CCW_CMD_NOOP) + +#define ccw_is_tic(_ccw) ((_ccw)->cmd_code == CCW_CMD_TIC) + +#define ccw_is_idal(_ccw) ((_ccw)->flags & CCW_FLAG_IDA) + + +#define ccw_is_chain(_ccw) ((_ccw)->flags & (CCW_FLAG_CC | CCW_FLAG_DC)) + +static struct ccwchain *ccwchain_alloc(struct channel_program *cp, int len) +{ + struct ccwchain *chain; + void *data; + size_t size; + + /* Make ccw address aligned to 8. */ + size = ((sizeof(*chain) + 7L) & -8L) + + sizeof(*chain->ch_ccw) * len + + sizeof(*chain->ch_pat) * len; + chain = kzalloc(size, GFP_DMA | GFP_KERNEL); + if (!chain) + return NULL; + + data = (u8 *)chain + ((sizeof(*chain) + 7L) & -8L); + chain->ch_ccw = (struct ccw1 *)data; + + data = (u8 *)(chain->ch_ccw) + sizeof(*chain->ch_ccw) * len; + chain->ch_pat = (struct pfn_array_table *)data; + + chain->ch_len = len; + + list_add_tail(&chain->next, &cp->ccwchain_list); + + return chain; +} + +static void ccwchain_free(struct ccwchain *chain) +{ + list_del(&chain->next); + kfree(chain); +} + +/* Free resource for a ccw that allocated memory for its cda. */ +static void ccwchain_cda_free(struct ccwchain *chain, int idx) +{ + struct ccw1 *ccw = chain->ch_ccw + idx; + + if (!ccw->count) + return; + + kfree((void *)(u64)ccw->cda); +} + +/* Unpin the pages then free the memory resources. */ +static void cp_unpin_free(struct channel_program *cp) +{ + struct ccwchain *chain, *temp; + int i; + + list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) { + for (i = 0; i < chain->ch_len; i++) { + pfn_array_table_unpin_free(chain->ch_pat + i, + cp->mdev); + ccwchain_cda_free(chain, i); + } + ccwchain_free(chain); + } +} + +/** + * ccwchain_calc_length - calculate the length of the ccw chain. + * @iova: guest physical address of the target ccw chain + * @cp: channel_program on which to perform the operation + * + * This is the chain length not considering any TICs. + * You need to do a new round for each TIC target. + * + * Returns: the length of the ccw chain or -errno. + */ +static int ccwchain_calc_length(u64 iova, struct channel_program *cp) +{ + struct ccw1 *ccw, *p; + int cnt; + + /* + * Copy current chain from guest to host kernel. + * Currently the chain length is limited to CCWCHAIN_LEN_MAX (256). + * So copying 2K is enough (safe). + */ + p = ccw = kcalloc(CCWCHAIN_LEN_MAX, sizeof(*ccw), GFP_KERNEL); + if (!ccw) + return -ENOMEM; + + cnt = copy_ccw_from_iova(cp, ccw, iova, CCWCHAIN_LEN_MAX); + if (cnt) { + kfree(ccw); + return cnt; + } + + cnt = 0; + do { + cnt++; + + if ((!ccw_is_chain(ccw)) && (!ccw_is_tic(ccw))) + break; + + ccw++; + } while (cnt < CCWCHAIN_LEN_MAX + 1); + + if (cnt == CCWCHAIN_LEN_MAX + 1) + cnt = -EINVAL; + + kfree(p); + return cnt; +} + +static int tic_target_chain_exists(struct ccw1 *tic, struct channel_program *cp) +{ + struct ccwchain *chain; + u32 ccw_head, ccw_tail; + + list_for_each_entry(chain, &cp->ccwchain_list, next) { + ccw_head = chain->ch_iova; + ccw_tail = ccw_head + (chain->ch_len - 1) * sizeof(struct ccw1); + + if ((ccw_head <= tic->cda) && (tic->cda <= ccw_tail)) + return 1; + } + + return 0; +} + +static int ccwchain_loop_tic(struct ccwchain *chain, + struct channel_program *cp); + +static int ccwchain_handle_tic(struct ccw1 *tic, struct channel_program *cp) +{ + struct ccwchain *chain; + int len, ret; + + /* May transfer to an existing chain. */ + if (tic_target_chain_exists(tic, cp)) + return 0; + + /* Get chain length. */ + len = ccwchain_calc_length(tic->cda, cp); + if (len < 0) + return len; + + /* Need alloc a new chain for this one. */ + chain = ccwchain_alloc(cp, len); + if (!chain) + return -ENOMEM; + chain->ch_iova = tic->cda; + + /* Copy the new chain from user. */ + ret = copy_ccw_from_iova(cp, chain->ch_ccw, tic->cda, len); + if (ret) { + ccwchain_free(chain); + return ret; + } + + /* Loop for tics on this new chain. */ + return ccwchain_loop_tic(chain, cp); +} + +/* Loop for TICs. */ +static int ccwchain_loop_tic(struct ccwchain *chain, struct channel_program *cp) +{ + struct ccw1 *tic; + int i, ret; + + for (i = 0; i < chain->ch_len; i++) { + tic = chain->ch_ccw + i; + + if (!ccw_is_tic(tic)) + continue; + + ret = ccwchain_handle_tic(tic, cp); + if (ret) + return ret; + } + + return 0; +} + +static int ccwchain_fetch_tic(struct ccwchain *chain, + int idx, + struct channel_program *cp) +{ + struct ccw1 *ccw = chain->ch_ccw + idx; + struct ccwchain *iter; + u32 ccw_head, ccw_tail; + + list_for_each_entry(iter, &cp->ccwchain_list, next) { + ccw_head = iter->ch_iova; + ccw_tail = ccw_head + (iter->ch_len - 1) * sizeof(struct ccw1); + + if ((ccw_head <= ccw->cda) && (ccw->cda <= ccw_tail)) { + ccw->cda = (__u32) (addr_t) (iter->ch_ccw + + (ccw->cda - ccw_head)); + return 0; + } + } + + return -EFAULT; +} + +static int ccwchain_fetch_direct(struct ccwchain *chain, + int idx, + struct channel_program *cp) +{ + struct ccw1 *ccw; + struct pfn_array_table *pat; + unsigned long *idaws; + int idaw_nr; + + ccw = chain->ch_ccw + idx; + + /* + * Pin data page(s) in memory. + * The number of pages actually is the count of the idaws which will be + * needed when translating a direct ccw to a idal ccw. + */ + pat = chain->ch_pat + idx; + if (pfn_array_table_init(pat, 1)) + return -ENOMEM; + idaw_nr = pfn_array_alloc_pin(pat->pat_pa, cp->mdev, + ccw->cda, ccw->count); + if (idaw_nr < 0) + return idaw_nr; + + /* Translate this direct ccw to a idal ccw. */ + idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL); + if (!idaws) { + pfn_array_table_unpin_free(pat, cp->mdev); + return -ENOMEM; + } + ccw->cda = (__u32) virt_to_phys(idaws); + ccw->flags |= CCW_FLAG_IDA; + + pfn_array_table_idal_create_words(pat, idaws); + + return 0; +} + +static int ccwchain_fetch_idal(struct ccwchain *chain, + int idx, + struct channel_program *cp) +{ + struct ccw1 *ccw; + struct pfn_array_table *pat; + unsigned long *idaws; + u64 idaw_iova; + unsigned int idaw_nr, idaw_len; + int i, ret; + + ccw = chain->ch_ccw + idx; + + /* Calculate size of idaws. */ + ret = copy_from_iova(cp->mdev, &idaw_iova, ccw->cda, sizeof(idaw_iova)); + if (ret) + return ret; + idaw_nr = idal_nr_words((void *)(idaw_iova), ccw->count); + idaw_len = idaw_nr * sizeof(*idaws); + + /* Pin data page(s) in memory. */ + pat = chain->ch_pat + idx; + ret = pfn_array_table_init(pat, idaw_nr); + if (ret) + return ret; + + /* Translate idal ccw to use new allocated idaws. */ + idaws = kzalloc(idaw_len, GFP_DMA | GFP_KERNEL); + if (!idaws) { + ret = -ENOMEM; + goto out_unpin; + } + + ret = copy_from_iova(cp->mdev, idaws, ccw->cda, idaw_len); + if (ret) + goto out_free_idaws; + + ccw->cda = virt_to_phys(idaws); + + for (i = 0; i < idaw_nr; i++) { + idaw_iova = *(idaws + i); + if (IS_ERR_VALUE(idaw_iova)) { + ret = -EFAULT; + goto out_free_idaws; + } + + ret = pfn_array_alloc_pin(pat->pat_pa + i, cp->mdev, + idaw_iova, 1); + if (ret < 0) + goto out_free_idaws; + } + + pfn_array_table_idal_create_words(pat, idaws); + + return 0; + +out_free_idaws: + kfree(idaws); +out_unpin: + pfn_array_table_unpin_free(pat, cp->mdev); + return ret; +} + +/* + * Fetch one ccw. + * To reduce memory copy, we'll pin the cda page in memory, + * and to get rid of the cda 2G limitiaion of ccw1, we'll translate + * direct ccws to idal ccws. + */ +static int ccwchain_fetch_one(struct ccwchain *chain, + int idx, + struct channel_program *cp) +{ + struct ccw1 *ccw = chain->ch_ccw + idx; + + if (ccw_is_test(ccw) || ccw_is_noop(ccw)) + return 0; + + if (ccw_is_tic(ccw)) + return ccwchain_fetch_tic(chain, idx, cp); + + if (ccw_is_idal(ccw)) + return ccwchain_fetch_idal(chain, idx, cp); + + return ccwchain_fetch_direct(chain, idx, cp); +} + +/** + * cp_init() - allocate ccwchains for a channel program. + * @cp: channel_program on which to perform the operation + * @mdev: the mediated device to perform pin/unpin operations + * @orb: control block for the channel program from the guest + * + * This creates one or more ccwchain(s), and copies the raw data of + * the target channel program from @orb->cmd.iova to the new ccwchain(s). + * + * Limitations: + * 1. Supports only prefetch enabled mode. + * 2. Supports idal(c64) ccw chaining. + * 3. Supports 4k idaw. + * + * Returns: + * %0 on success and a negative error value on failure. + */ +int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) +{ + u64 iova = orb->cmd.cpa; + struct ccwchain *chain; + int len, ret; + + /* + * XXX: + * Only support prefetch enable mode now. + * Only support 64bit addressing idal. + * Only support 4k IDAW. + */ + if (!orb->cmd.pfch || !orb->cmd.c64 || orb->cmd.i2k) + return -EOPNOTSUPP; + + INIT_LIST_HEAD(&cp->ccwchain_list); + memcpy(&cp->orb, orb, sizeof(*orb)); + cp->mdev = mdev; + + /* Get chain length. */ + len = ccwchain_calc_length(iova, cp); + if (len < 0) + return len; + + /* Alloc mem for the head chain. */ + chain = ccwchain_alloc(cp, len); + if (!chain) + return -ENOMEM; + chain->ch_iova = iova; + + /* Copy the head chain from guest. */ + ret = copy_ccw_from_iova(cp, chain->ch_ccw, iova, len); + if (ret) { + ccwchain_free(chain); + return ret; + } + + /* Now loop for its TICs. */ + ret = ccwchain_loop_tic(chain, cp); + if (ret) + cp_unpin_free(cp); + + return ret; +} + + +/** + * cp_free() - free resources for channel program. + * @cp: channel_program on which to perform the operation + * + * This unpins the memory pages and frees the memory space occupied by + * @cp, which must have been returned by a previous call to cp_init(). + * Otherwise, undefined behavior occurs. + */ +void cp_free(struct channel_program *cp) +{ + cp_unpin_free(cp); +} + +/** + * cp_prefetch() - translate a guest physical address channel program to + * a real-device runnable channel program. + * @cp: channel_program on which to perform the operation + * + * This function translates the guest-physical-address channel program + * and stores the result to ccwchain list. @cp must have been + * initialized by a previous call with cp_init(). Otherwise, undefined + * behavior occurs. + * + * The S/390 CCW Translation APIS (prefixed by 'cp_') are introduced + * as helpers to do ccw chain translation inside the kernel. Basically + * they accept a channel program issued by a virtual machine, and + * translate the channel program to a real-device runnable channel + * program. + * + * These APIs will copy the ccws into kernel-space buffers, and update + * the guest phsical addresses with their corresponding host physical + * addresses. Then channel I/O device drivers could issue the + * translated channel program to real devices to perform an I/O + * operation. + * + * These interfaces are designed to support translation only for + * channel programs, which are generated and formatted by a + * guest. Thus this will make it possible for things like VFIO to + * leverage the interfaces to passthrough a channel I/O mediated + * device in QEMU. + * + * We support direct ccw chaining by translating them to idal ccws. + * + * Returns: + * %0 on success and a negative error value on failure. + */ +int cp_prefetch(struct channel_program *cp) +{ + struct ccwchain *chain; + int len, idx, ret; + + list_for_each_entry(chain, &cp->ccwchain_list, next) { + len = chain->ch_len; + for (idx = 0; idx < len; idx++) { + ret = ccwchain_fetch_one(chain, idx, cp); + if (ret) + return ret; + } + } + + return 0; +} + +/** + * cp_get_orb() - get the orb of the channel program + * @cp: channel_program on which to perform the operation + * @intparm: new intparm for the returned orb + * @lpm: candidate value of the logical-path mask for the returned orb + * + * This function returns the address of the updated orb of the channel + * program. Channel I/O device drivers could use this orb to issue a + * ssch. + */ +union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm) +{ + union orb *orb; + struct ccwchain *chain; + struct ccw1 *cpa; + + orb = &cp->orb; + + orb->cmd.intparm = intparm; + orb->cmd.fmt = 1; + orb->cmd.key = PAGE_DEFAULT_KEY >> 4; + + if (orb->cmd.lpm == 0) + orb->cmd.lpm = lpm; + + chain = list_first_entry(&cp->ccwchain_list, struct ccwchain, next); + cpa = chain->ch_ccw; + orb->cmd.cpa = (__u32) __pa(cpa); + + return orb; +} + +/** + * cp_update_scsw() - update scsw for a channel program. + * @cp: channel_program on which to perform the operation + * @scsw: I/O results of the channel program and also the target to be + * updated + * + * @scsw contains the I/O results of the channel program that pointed + * to by @cp. However what @scsw->cpa stores is a host physical + * address, which is meaningless for the guest, which is waiting for + * the I/O results. + * + * This function updates @scsw->cpa to its coressponding guest physical + * address. + */ +void cp_update_scsw(struct channel_program *cp, union scsw *scsw) +{ + struct ccwchain *chain; + u32 cpa = scsw->cmd.cpa; + u32 ccw_head, ccw_tail; + + /* + * LATER: + * For now, only update the cmd.cpa part. We may need to deal with + * other portions of the schib as well, even if we don't return them + * in the ioctl directly. Path status changes etc. + */ + list_for_each_entry(chain, &cp->ccwchain_list, next) { + ccw_head = (u32)(u64)chain->ch_ccw; + ccw_tail = (u32)(u64)(chain->ch_ccw + chain->ch_len - 1); + + if ((ccw_head <= cpa) && (cpa <= ccw_tail)) { + /* + * (cpa - ccw_head) is the offset value of the host + * physical ccw to its chain head. + * Adding this value to the guest physical ccw chain + * head gets us the guest cpa. + */ + cpa = chain->ch_iova + (cpa - ccw_head); + break; + } + } + + scsw->cmd.cpa = cpa; +} + +/** + * cp_iova_pinned() - check if an iova is pinned for a ccw chain. + * @cmd: ccwchain command on which to perform the operation + * @iova: the iova to check + * + * If the @iova is currently pinned for the ccw chain, return true; + * else return false. + */ +bool cp_iova_pinned(struct channel_program *cp, u64 iova) +{ + struct ccwchain *chain; + int i; + + list_for_each_entry(chain, &cp->ccwchain_list, next) { + for (i = 0; i < chain->ch_len; i++) + if (pfn_array_table_iova_pinned(chain->ch_pat + i, + iova)) + return true; + } + + return false; +} diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h new file mode 100644 index 000000000000..7a1996b3b36d --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_cp.h @@ -0,0 +1,42 @@ +/* + * channel program interfaces + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com> + */ + +#ifndef _VFIO_CCW_CP_H_ +#define _VFIO_CCW_CP_H_ + +#include <asm/cio.h> +#include <asm/scsw.h> + +#include "orb.h" + +/** + * struct channel_program - manage information for channel program + * @ccwchain_list: list head of ccwchains + * @orb: orb for the currently processed ssch request + * @mdev: the mediated device to perform page pinning/unpinning + * + * @ccwchain_list is the head of a ccwchain list, that contents the + * translated result of the guest channel program that pointed out by + * the iova parameter when calling cp_init. + */ +struct channel_program { + struct list_head ccwchain_list; + union orb orb; + struct device *mdev; +}; + +extern int cp_init(struct channel_program *cp, struct device *mdev, + union orb *orb); +extern void cp_free(struct channel_program *cp); +extern int cp_prefetch(struct channel_program *cp); +extern union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm); +extern void cp_update_scsw(struct channel_program *cp, union scsw *scsw); +extern bool cp_iova_pinned(struct channel_program *cp, u64 iova); + +#endif diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c new file mode 100644 index 000000000000..e90dd43d2a55 --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -0,0 +1,308 @@ +/* + * VFIO based Physical Subchannel device driver + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/mdev.h> + +#include <asm/isc.h> + +#include "ioasm.h" +#include "css.h" +#include "vfio_ccw_private.h" + +struct workqueue_struct *vfio_ccw_work_q; + +/* + * Helpers + */ +int vfio_ccw_sch_quiesce(struct subchannel *sch) +{ + struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + DECLARE_COMPLETION_ONSTACK(completion); + int iretry, ret = 0; + + spin_lock_irq(sch->lock); + if (!sch->schib.pmcw.ena) + goto out_unlock; + ret = cio_disable_subchannel(sch); + if (ret != -EBUSY) + goto out_unlock; + + do { + iretry = 255; + + ret = cio_cancel_halt_clear(sch, &iretry); + while (ret == -EBUSY) { + /* + * Flush all I/O and wait for + * cancel/halt/clear completion. + */ + private->completion = &completion; + spin_unlock_irq(sch->lock); + + wait_for_completion_timeout(&completion, 3*HZ); + + spin_lock_irq(sch->lock); + private->completion = NULL; + flush_workqueue(vfio_ccw_work_q); + ret = cio_cancel_halt_clear(sch, &iretry); + }; + + ret = cio_disable_subchannel(sch); + } while (ret == -EBUSY); +out_unlock: + private->state = VFIO_CCW_STATE_NOT_OPER; + spin_unlock_irq(sch->lock); + return ret; +} + +static void vfio_ccw_sch_io_todo(struct work_struct *work) +{ + struct vfio_ccw_private *private; + struct subchannel *sch; + struct irb *irb; + + private = container_of(work, struct vfio_ccw_private, io_work); + irb = &private->irb; + sch = private->sch; + + if (scsw_is_solicited(&irb->scsw)) { + cp_update_scsw(&private->cp, &irb->scsw); + cp_free(&private->cp); + } + memcpy(private->io_region.irb_area, irb, sizeof(*irb)); + + if (private->io_trigger) + eventfd_signal(private->io_trigger, 1); + + if (private->mdev) + private->state = VFIO_CCW_STATE_IDLE; +} + +/* + * Sysfs interfaces + */ +static ssize_t chpids_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + struct chsc_ssd_info *ssd = &sch->ssd_info; + ssize_t ret = 0; + int chp; + int mask; + + for (chp = 0; chp < 8; chp++) { + mask = 0x80 >> chp; + if (ssd->path_mask & mask) + ret += sprintf(buf + ret, "%02x ", ssd->chpid[chp].id); + else + ret += sprintf(buf + ret, "00 "); + } + ret += sprintf(buf+ret, "\n"); + return ret; +} + +static ssize_t pimpampom_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + struct pmcw *pmcw = &sch->schib.pmcw; + + return sprintf(buf, "%02x %02x %02x\n", + pmcw->pim, pmcw->pam, pmcw->pom); +} + +static DEVICE_ATTR(chpids, 0444, chpids_show, NULL); +static DEVICE_ATTR(pimpampom, 0444, pimpampom_show, NULL); + +static struct attribute *vfio_subchannel_attrs[] = { + &dev_attr_chpids.attr, + &dev_attr_pimpampom.attr, + NULL, +}; + +static struct attribute_group vfio_subchannel_attr_group = { + .attrs = vfio_subchannel_attrs, +}; + +/* + * Css driver callbacks + */ +static void vfio_ccw_sch_irq(struct subchannel *sch) +{ + struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + + inc_irq_stat(IRQIO_CIO); + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); +} + +static int vfio_ccw_sch_probe(struct subchannel *sch) +{ + struct pmcw *pmcw = &sch->schib.pmcw; + struct vfio_ccw_private *private; + int ret; + + if (pmcw->qf) { + dev_warn(&sch->dev, "vfio: ccw: does not support QDIO: %s\n", + dev_name(&sch->dev)); + return -ENODEV; + } + + private = kzalloc(sizeof(*private), GFP_KERNEL | GFP_DMA); + if (!private) + return -ENOMEM; + private->sch = sch; + dev_set_drvdata(&sch->dev, private); + + spin_lock_irq(sch->lock); + private->state = VFIO_CCW_STATE_NOT_OPER; + sch->isc = VFIO_CCW_ISC; + ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch); + spin_unlock_irq(sch->lock); + if (ret) + goto out_free; + + ret = sysfs_create_group(&sch->dev.kobj, &vfio_subchannel_attr_group); + if (ret) + goto out_disable; + + ret = vfio_ccw_mdev_reg(sch); + if (ret) + goto out_rm_group; + + INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); + atomic_set(&private->avail, 1); + private->state = VFIO_CCW_STATE_STANDBY; + + return 0; + +out_rm_group: + sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group); +out_disable: + cio_disable_subchannel(sch); +out_free: + dev_set_drvdata(&sch->dev, NULL); + kfree(private); + return ret; +} + +static int vfio_ccw_sch_remove(struct subchannel *sch) +{ + struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + + vfio_ccw_sch_quiesce(sch); + + vfio_ccw_mdev_unreg(sch); + + sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group); + + dev_set_drvdata(&sch->dev, NULL); + + kfree(private); + + return 0; +} + +static void vfio_ccw_sch_shutdown(struct subchannel *sch) +{ + vfio_ccw_sch_quiesce(sch); +} + +/** + * vfio_ccw_sch_event - process subchannel event + * @sch: subchannel + * @process: non-zero if function is called in process context + * + * An unspecified event occurred for this subchannel. Adjust data according + * to the current operational state of the subchannel. Return zero when the + * event has been handled sufficiently or -EAGAIN when this function should + * be called again in process context. + */ +static int vfio_ccw_sch_event(struct subchannel *sch, int process) +{ + struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + unsigned long flags; + + spin_lock_irqsave(sch->lock, flags); + if (!device_is_registered(&sch->dev)) + goto out_unlock; + + if (work_pending(&sch->todo_work)) + goto out_unlock; + + if (cio_update_schib(sch)) { + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); + goto out_unlock; + } + + private = dev_get_drvdata(&sch->dev); + if (private->state == VFIO_CCW_STATE_NOT_OPER) { + private->state = private->mdev ? VFIO_CCW_STATE_IDLE : + VFIO_CCW_STATE_STANDBY; + } + +out_unlock: + spin_unlock_irqrestore(sch->lock, flags); + + return 0; +} + +static struct css_device_id vfio_ccw_sch_ids[] = { + { .match_flags = 0x1, .type = SUBCHANNEL_TYPE_IO, }, + { /* end of list */ }, +}; +MODULE_DEVICE_TABLE(css, vfio_ccw_sch_ids); + +static struct css_driver vfio_ccw_sch_driver = { + .drv = { + .name = "vfio_ccw", + .owner = THIS_MODULE, + }, + .subchannel_type = vfio_ccw_sch_ids, + .irq = vfio_ccw_sch_irq, + .probe = vfio_ccw_sch_probe, + .remove = vfio_ccw_sch_remove, + .shutdown = vfio_ccw_sch_shutdown, + .sch_event = vfio_ccw_sch_event, +}; + +static int __init vfio_ccw_sch_init(void) +{ + int ret; + + vfio_ccw_work_q = create_singlethread_workqueue("vfio-ccw"); + if (!vfio_ccw_work_q) + return -ENOMEM; + + isc_register(VFIO_CCW_ISC); + ret = css_driver_register(&vfio_ccw_sch_driver); + if (ret) { + isc_unregister(VFIO_CCW_ISC); + destroy_workqueue(vfio_ccw_work_q); + } + + return ret; +} + +static void __exit vfio_ccw_sch_exit(void) +{ + css_driver_unregister(&vfio_ccw_sch_driver); + isc_unregister(VFIO_CCW_ISC); + destroy_workqueue(vfio_ccw_work_q); +} +module_init(vfio_ccw_sch_init); +module_exit(vfio_ccw_sch_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c new file mode 100644 index 000000000000..80a0559cd7ce --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -0,0 +1,203 @@ +/* + * Finite state machine for vfio-ccw device handling + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + */ + +#include <linux/vfio.h> +#include <linux/mdev.h> + +#include "ioasm.h" +#include "vfio_ccw_private.h" + +static int fsm_io_helper(struct vfio_ccw_private *private) +{ + struct subchannel *sch; + union orb *orb; + int ccode; + __u8 lpm; + unsigned long flags; + + sch = private->sch; + + spin_lock_irqsave(sch->lock, flags); + private->state = VFIO_CCW_STATE_BUSY; + spin_unlock_irqrestore(sch->lock, flags); + + orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm); + + /* Issue "Start Subchannel" */ + ccode = ssch(sch->schid, orb); + + switch (ccode) { + case 0: + /* + * Initialize device status information + */ + sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND; + return 0; + case 1: /* Status pending */ + case 2: /* Busy */ + return -EBUSY; + case 3: /* Device/path not operational */ + { + lpm = orb->cmd.lpm; + if (lpm != 0) + sch->lpm &= ~lpm; + else + sch->lpm = 0; + + if (cio_update_schib(sch)) + return -ENODEV; + + return sch->lpm ? -EACCES : -ENODEV; + } + default: + return ccode; + } +} + +static void fsm_notoper(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + struct subchannel *sch = private->sch; + + /* + * TODO: + * Probably we should send the machine check to the guest. + */ + css_sched_sch_todo(sch, SCH_TODO_UNREG); + private->state = VFIO_CCW_STATE_NOT_OPER; +} + +/* + * No operation action. + */ +static void fsm_nop(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ +} + +static void fsm_io_error(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + pr_err("vfio-ccw: FSM: I/O request from state:%d\n", private->state); + private->io_region.ret_code = -EIO; +} + +static void fsm_io_busy(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + private->io_region.ret_code = -EBUSY; +} + +static void fsm_disabled_irq(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + struct subchannel *sch = private->sch; + + /* + * An interrupt in a disabled state means a previous disable was not + * successful - should not happen, but we try to disable again. + */ + cio_disable_subchannel(sch); +} + +/* + * Deal with the ccw command request from the userspace. + */ +static void fsm_io_request(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + union orb *orb; + union scsw *scsw = &private->scsw; + struct ccw_io_region *io_region = &private->io_region; + struct mdev_device *mdev = private->mdev; + + private->state = VFIO_CCW_STATE_BOXED; + + memcpy(scsw, io_region->scsw_area, sizeof(*scsw)); + + if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) { + orb = (union orb *)io_region->orb_area; + + io_region->ret_code = cp_init(&private->cp, mdev_dev(mdev), + orb); + if (io_region->ret_code) + goto err_out; + + io_region->ret_code = cp_prefetch(&private->cp); + if (io_region->ret_code) { + cp_free(&private->cp); + goto err_out; + } + + /* Start channel program and wait for I/O interrupt. */ + io_region->ret_code = fsm_io_helper(private); + if (io_region->ret_code) { + cp_free(&private->cp); + goto err_out; + } + return; + } else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) { + /* XXX: Handle halt. */ + io_region->ret_code = -EOPNOTSUPP; + goto err_out; + } else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) { + /* XXX: Handle clear. */ + io_region->ret_code = -EOPNOTSUPP; + goto err_out; + } + +err_out: + private->state = VFIO_CCW_STATE_IDLE; +} + +/* + * Got an interrupt for a normal io (state busy). + */ +static void fsm_irq(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + struct irb *irb = this_cpu_ptr(&cio_irb); + + memcpy(&private->irb, irb, sizeof(*irb)); + + queue_work(vfio_ccw_work_q, &private->io_work); + + if (private->completion) + complete(private->completion); +} + +/* + * Device statemachine + */ +fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS] = { + [VFIO_CCW_STATE_NOT_OPER] = { + [VFIO_CCW_EVENT_NOT_OPER] = fsm_nop, + [VFIO_CCW_EVENT_IO_REQ] = fsm_io_error, + [VFIO_CCW_EVENT_INTERRUPT] = fsm_disabled_irq, + }, + [VFIO_CCW_STATE_STANDBY] = { + [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, + [VFIO_CCW_EVENT_IO_REQ] = fsm_io_error, + [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, + }, + [VFIO_CCW_STATE_IDLE] = { + [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, + [VFIO_CCW_EVENT_IO_REQ] = fsm_io_request, + [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, + }, + [VFIO_CCW_STATE_BOXED] = { + [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, + [VFIO_CCW_EVENT_IO_REQ] = fsm_io_busy, + [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, + }, + [VFIO_CCW_STATE_BUSY] = { + [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, + [VFIO_CCW_EVENT_IO_REQ] = fsm_io_busy, + [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, + }, +}; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c new file mode 100644 index 000000000000..e72abbc18ee3 --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -0,0 +1,425 @@ +/* + * Physical device callbacks for vfio_ccw + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com> + */ + +#include <linux/vfio.h> +#include <linux/mdev.h> + +#include "vfio_ccw_private.h" + +static int vfio_ccw_mdev_reset(struct mdev_device *mdev) +{ + struct vfio_ccw_private *private; + struct subchannel *sch; + int ret; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); + sch = private->sch; + /* + * TODO: + * In the cureent stage, some things like "no I/O running" and "no + * interrupt pending" are clear, but we are not sure what other state + * we need to care about. + * There are still a lot more instructions need to be handled. We + * should come back here later. + */ + ret = vfio_ccw_sch_quiesce(sch); + if (ret) + return ret; + + ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch); + if (!ret) + private->state = VFIO_CCW_STATE_IDLE; + + return ret; +} + +static int vfio_ccw_mdev_notifier(struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct vfio_ccw_private *private = + container_of(nb, struct vfio_ccw_private, nb); + + /* + * Vendor drivers MUST unpin pages in response to an + * invalidation. + */ + if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) { + struct vfio_iommu_type1_dma_unmap *unmap = data; + + if (!cp_iova_pinned(&private->cp, unmap->iova)) + return NOTIFY_OK; + + if (vfio_ccw_mdev_reset(private->mdev)) + return NOTIFY_BAD; + + cp_free(&private->cp); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf) +{ + return sprintf(buf, "I/O subchannel (Non-QDIO)\n"); +} +MDEV_TYPE_ATTR_RO(name); + +static ssize_t device_api_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_CCW_STRING); +} +MDEV_TYPE_ATTR_RO(device_api); + +static ssize_t available_instances_show(struct kobject *kobj, + struct device *dev, char *buf) +{ + struct vfio_ccw_private *private = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", atomic_read(&private->avail)); +} +MDEV_TYPE_ATTR_RO(available_instances); + +static struct attribute *mdev_types_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_device_api.attr, + &mdev_type_attr_available_instances.attr, + NULL, +}; + +static struct attribute_group mdev_type_group = { + .name = "io", + .attrs = mdev_types_attrs, +}; + +struct attribute_group *mdev_type_groups[] = { + &mdev_type_group, + NULL, +}; + +static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev) +{ + struct vfio_ccw_private *private = + dev_get_drvdata(mdev_parent_dev(mdev)); + + if (private->state == VFIO_CCW_STATE_NOT_OPER) + return -ENODEV; + + if (atomic_dec_if_positive(&private->avail) < 0) + return -EPERM; + + private->mdev = mdev; + private->state = VFIO_CCW_STATE_IDLE; + + return 0; +} + +static int vfio_ccw_mdev_remove(struct mdev_device *mdev) +{ + struct vfio_ccw_private *private = + dev_get_drvdata(mdev_parent_dev(mdev)); + + if ((private->state != VFIO_CCW_STATE_NOT_OPER) && + (private->state != VFIO_CCW_STATE_STANDBY)) { + if (!vfio_ccw_mdev_reset(mdev)) + private->state = VFIO_CCW_STATE_STANDBY; + /* The state will be NOT_OPER on error. */ + } + + private->mdev = NULL; + atomic_inc(&private->avail); + + return 0; +} + +static int vfio_ccw_mdev_open(struct mdev_device *mdev) +{ + struct vfio_ccw_private *private = + dev_get_drvdata(mdev_parent_dev(mdev)); + unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; + + private->nb.notifier_call = vfio_ccw_mdev_notifier; + + return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &events, &private->nb); +} + +void vfio_ccw_mdev_release(struct mdev_device *mdev) +{ + struct vfio_ccw_private *private = + dev_get_drvdata(mdev_parent_dev(mdev)); + + vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &private->nb); +} + +static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev, + char __user *buf, + size_t count, + loff_t *ppos) +{ + struct vfio_ccw_private *private; + struct ccw_io_region *region; + + if (*ppos + count > sizeof(*region)) + return -EINVAL; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); + region = &private->io_region; + if (copy_to_user(buf, (void *)region + *ppos, count)) + return -EFAULT; + + return count; +} + +static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + struct vfio_ccw_private *private; + struct ccw_io_region *region; + + if (*ppos + count > sizeof(*region)) + return -EINVAL; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); + if (private->state != VFIO_CCW_STATE_IDLE) + return -EACCES; + + region = &private->io_region; + if (copy_from_user((void *)region + *ppos, buf, count)) + return -EFAULT; + + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ); + if (region->ret_code != 0) { + private->state = VFIO_CCW_STATE_IDLE; + return region->ret_code; + } + + return count; +} + +static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info) +{ + info->flags = VFIO_DEVICE_FLAGS_CCW | VFIO_DEVICE_FLAGS_RESET; + info->num_regions = VFIO_CCW_NUM_REGIONS; + info->num_irqs = VFIO_CCW_NUM_IRQS; + + return 0; +} + +static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info, + u16 *cap_type_id, + void **cap_type) +{ + switch (info->index) { + case VFIO_CCW_CONFIG_REGION_INDEX: + info->offset = 0; + info->size = sizeof(struct ccw_io_region); + info->flags = VFIO_REGION_INFO_FLAG_READ + | VFIO_REGION_INFO_FLAG_WRITE; + return 0; + default: + return -EINVAL; + } +} + +int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info) +{ + if (info->index != VFIO_CCW_IO_IRQ_INDEX) + return -EINVAL; + + info->count = 1; + info->flags = VFIO_IRQ_INFO_EVENTFD; + + return 0; +} + +static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev, + uint32_t flags, + void __user *data) +{ + struct vfio_ccw_private *private; + struct eventfd_ctx **ctx; + + if (!(flags & VFIO_IRQ_SET_ACTION_TRIGGER)) + return -EINVAL; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); + ctx = &private->io_trigger; + + switch (flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { + case VFIO_IRQ_SET_DATA_NONE: + { + if (*ctx) + eventfd_signal(*ctx, 1); + return 0; + } + case VFIO_IRQ_SET_DATA_BOOL: + { + uint8_t trigger; + + if (get_user(trigger, (uint8_t __user *)data)) + return -EFAULT; + + if (trigger && *ctx) + eventfd_signal(*ctx, 1); + return 0; + } + case VFIO_IRQ_SET_DATA_EVENTFD: + { + int32_t fd; + + if (get_user(fd, (int32_t __user *)data)) + return -EFAULT; + + if (fd == -1) { + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = NULL; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + if (*ctx) + eventfd_ctx_put(*ctx); + + *ctx = efdctx; + } else + return -EINVAL; + + return 0; + } + default: + return -EINVAL; + } +} + +static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, + unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + unsigned long minsz; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = vfio_ccw_mdev_get_device_info(&info); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_GET_REGION_INFO: + { + struct vfio_region_info info; + u16 cap_type_id = 0; + void *cap_type = NULL; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = vfio_ccw_mdev_get_region_info(&info, &cap_type_id, + &cap_type); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || info.index >= VFIO_CCW_NUM_IRQS) + return -EINVAL; + + ret = vfio_ccw_mdev_get_irq_info(&info); + if (ret) + return ret; + + if (info.count == -1) + return -EINVAL; + + return copy_to_user((void __user *)arg, &info, minsz); + } + case VFIO_DEVICE_SET_IRQS: + { + struct vfio_irq_set hdr; + size_t data_size; + void __user *data; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + ret = vfio_set_irqs_validate_and_prepare(&hdr, 1, + VFIO_CCW_NUM_IRQS, + &data_size); + if (ret) + return ret; + + data = (void __user *)(arg + minsz); + return vfio_ccw_mdev_set_irqs(mdev, hdr.flags, data); + } + case VFIO_DEVICE_RESET: + return vfio_ccw_mdev_reset(mdev); + default: + return -ENOTTY; + } +} + +static const struct mdev_parent_ops vfio_ccw_mdev_ops = { + .owner = THIS_MODULE, + .supported_type_groups = mdev_type_groups, + .create = vfio_ccw_mdev_create, + .remove = vfio_ccw_mdev_remove, + .open = vfio_ccw_mdev_open, + .release = vfio_ccw_mdev_release, + .read = vfio_ccw_mdev_read, + .write = vfio_ccw_mdev_write, + .ioctl = vfio_ccw_mdev_ioctl, +}; + +int vfio_ccw_mdev_reg(struct subchannel *sch) +{ + return mdev_register_device(&sch->dev, &vfio_ccw_mdev_ops); +} + +void vfio_ccw_mdev_unreg(struct subchannel *sch) +{ + mdev_unregister_device(&sch->dev); +} diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h new file mode 100644 index 000000000000..fc0f01c16ef9 --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -0,0 +1,96 @@ +/* + * Private stuff for vfio_ccw driver + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + * Xiao Feng Ren <renxiaof@linux.vnet.ibm.com> + */ + +#ifndef _VFIO_CCW_PRIVATE_H_ +#define _VFIO_CCW_PRIVATE_H_ + +#include <linux/completion.h> +#include <linux/eventfd.h> +#include <linux/workqueue.h> +#include <linux/vfio_ccw.h> + +#include "css.h" +#include "vfio_ccw_cp.h" + +/** + * struct vfio_ccw_private + * @sch: pointer to the subchannel + * @state: internal state of the device + * @completion: synchronization helper of the I/O completion + * @avail: available for creating a mediated device + * @mdev: pointer to the mediated device + * @nb: notifier for vfio events + * @io_region: MMIO region to input/output I/O arguments/results + * @cp: channel program for the current I/O operation + * @irb: irb info received from interrupt + * @scsw: scsw info + * @io_trigger: eventfd ctx for signaling userspace I/O results + * @io_work: work for deferral process of I/O handling + */ +struct vfio_ccw_private { + struct subchannel *sch; + int state; + struct completion *completion; + atomic_t avail; + struct mdev_device *mdev; + struct notifier_block nb; + struct ccw_io_region io_region; + + struct channel_program cp; + struct irb irb; + union scsw scsw; + + struct eventfd_ctx *io_trigger; + struct work_struct io_work; +} __aligned(8); + +extern int vfio_ccw_mdev_reg(struct subchannel *sch); +extern void vfio_ccw_mdev_unreg(struct subchannel *sch); + +extern int vfio_ccw_sch_quiesce(struct subchannel *sch); + +/* + * States of the device statemachine. + */ +enum vfio_ccw_state { + VFIO_CCW_STATE_NOT_OPER, + VFIO_CCW_STATE_STANDBY, + VFIO_CCW_STATE_IDLE, + VFIO_CCW_STATE_BOXED, + VFIO_CCW_STATE_BUSY, + /* last element! */ + NR_VFIO_CCW_STATES +}; + +/* + * Asynchronous events of the device statemachine. + */ +enum vfio_ccw_event { + VFIO_CCW_EVENT_NOT_OPER, + VFIO_CCW_EVENT_IO_REQ, + VFIO_CCW_EVENT_INTERRUPT, + /* last element! */ + NR_VFIO_CCW_EVENTS +}; + +/* + * Action called through jumptable. + */ +typedef void (fsm_func_t)(struct vfio_ccw_private *, enum vfio_ccw_event); +extern fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS]; + +static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private, + int event) +{ + vfio_ccw_jumptable[private->state][event](private, event); +} + +extern struct workqueue_struct *vfio_ccw_work_q; + +#endif diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 058db724b5a2..ea86da8c75f9 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -80,7 +80,7 @@ struct secaeskeytoken { * token. If keybitsize is given, the bitsize of the key is * also checked. Returns 0 on success or errno value on failure. */ -static int check_secaeskeytoken(u8 *token, int keybitsize) +static int check_secaeskeytoken(const u8 *token, int keybitsize) { struct secaeskeytoken *t = (struct secaeskeytoken *) token; @@ -1004,6 +1004,53 @@ int pkey_skey2pkey(const struct pkey_seckey *seckey, EXPORT_SYMBOL(pkey_skey2pkey); /* + * Verify key and give back some info about the key. + */ +int pkey_verifykey(const struct pkey_seckey *seckey, + u16 *pcardnr, u16 *pdomain, + u16 *pkeysize, u32 *pattributes) +{ + struct secaeskeytoken *t = (struct secaeskeytoken *) seckey; + u16 cardnr, domain; + u64 mkvp[2]; + int rc; + + /* check the secure key for valid AES secure key */ + rc = check_secaeskeytoken((u8 *) seckey, 0); + if (rc) + goto out; + if (pattributes) + *pattributes = PKEY_VERIFY_ATTR_AES; + if (pkeysize) + *pkeysize = t->bitsize; + + /* try to find a card which can handle this key */ + rc = pkey_findcard(seckey, &cardnr, &domain, 1); + if (rc) + goto out; + + /* check mkvp for old mkvp match */ + rc = mkvp_cache_fetch(cardnr, domain, mkvp); + if (rc) + goto out; + if (t->mkvp == mkvp[1]) { + DEBUG_DBG("pkey_verifykey secure key has old mkvp\n"); + if (pattributes) + *pattributes |= PKEY_VERIFY_ATTR_OLD_MKVP; + } + + if (pcardnr) + *pcardnr = cardnr; + if (pdomain) + *pdomain = domain; + +out: + DEBUG_DBG("pkey_verifykey rc=%d\n", rc); + return rc; +} +EXPORT_SYMBOL(pkey_verifykey); + +/* * File io functions */ @@ -1104,6 +1151,21 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd, return -EFAULT; break; } + case PKEY_VERIFYKEY: { + struct pkey_verifykey __user *uvk = (void __user *) arg; + struct pkey_verifykey kvk; + + if (copy_from_user(&kvk, uvk, sizeof(kvk))) + return -EFAULT; + rc = pkey_verifykey(&kvk.seckey, &kvk.cardnr, &kvk.domain, + &kvk.keysize, &kvk.attributes); + DEBUG_DBG("pkey_ioctl pkey_verifykey()=%d\n", rc); + if (rc) + break; + if (copy_to_user(uvk, &kvk, sizeof(kvk))) + return -EFAULT; + break; + } default: /* unknown/unsupported ioctl cmd */ return -ENOTTY; diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b59ee077a596..176b6cb1008d 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -409,6 +409,8 @@ typedef struct elf64_shdr { #define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ #define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ #define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ +#define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ #define NT_ARM_TLS 0x401 /* ARM TLS register */ #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 519eff362c1c..ae461050661a 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -198,6 +198,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; @@ -212,6 +213,7 @@ struct vfio_device_info { #define VFIO_DEVICE_API_PCI_STRING "vfio-pci" #define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform" #define VFIO_DEVICE_API_AMBA_STRING "vfio-amba" +#define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, @@ -446,6 +448,22 @@ enum { VFIO_PCI_NUM_IRQS }; +/* + * The vfio-ccw bus driver makes use of the following fixed region and + * IRQ index mapping. Unimplemented regions return a size of zero. + * Unimplemented IRQ types return a count of zero. + */ + +enum { + VFIO_CCW_CONFIG_REGION_INDEX, + VFIO_CCW_NUM_REGIONS +}; + +enum { + VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_NUM_IRQS +}; + /** * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h new file mode 100644 index 000000000000..34a7f6f9e065 --- /dev/null +++ b/include/uapi/linux/vfio_ccw.h @@ -0,0 +1,24 @@ +/* + * Interfaces for vfio-ccw + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + */ + +#ifndef _VFIO_CCW_H_ +#define _VFIO_CCW_H_ + +#include <linux/types.h> + +struct ccw_io_region { +#define ORB_AREA_SIZE 12 + __u8 orb_area[ORB_AREA_SIZE]; +#define SCSW_AREA_SIZE 12 + __u8 scsw_area[SCSW_AREA_SIZE]; +#define IRB_AREA_SIZE 96 + __u8 irb_area[IRB_AREA_SIZE]; + __u32 ret_code; +} __packed; + +#endif |